def train(self, data, y, iters=400):
    """Fit the logistic-regression parameters with Newton-CG.

    A leading column of ones is prepended to ``data`` and the optimisation
    starts from an all-zero parameter vector.  The value returned by the
    optimiser is stored in ``self.trained_theta``.

    Args:
        data: matrix of values, one row per data point and one column per
            variable/feature.
        y: matrix of expected results.
        iters: maximum number of iterations to perform.
    """
    n_rows = data.shape[0]
    data = np.hstack([np.ones((n_rows, 1)), data])
    theta0 = np.zeros(data.shape[1])
    sigmoid_cache = SigmoidCache()

    log.info("Training the logistic regressor. Regularized = %s", self.regularize)
    if self.regularize:
        objective, grad = regularized_error, regularized_gradient
        extra = (data, y, sigmoid_cache, self.tau)
    else:
        # Newton-CG is used here as well: the original author noted that
        # BFGS hit numerical problems on ../datasets/logreg/ex2data1.txt,
        # and that plain fmin works but does not use the gradient.
        objective, grad = error, gradient
        extra = (data, y, sigmoid_cache)

    self.trained_theta = opt.fmin_ncg(objective, theta0, grad, args=extra, maxiter=iters)
    log.info("Finished training theta %s", self.trained_theta)
def newton_cg(x0, f, f_prime, hessian):
    """Minimise a two-variable function with Newton-CG and record the path.

    Args:
        x0: starting point; its first two entries seed the recorded path.
        f: objective function.
        f_prime: gradient of ``f``.
        hessian: Hessian of ``f``.

    Returns:
        Three parallel lists ``(xs, ys, fs)``: the first coordinate, the
        second coordinate and the objective value at ``x0`` and at every
        point reported to the optimiser callback.
    """
    xs, ys, fs = [x0[0]], [x0[1]], [f(x0)]

    def record(point):
        # The unpacking restricts this helper to two-dimensional iterates.
        px, py = point
        xs.append(px)
        ys.append(py)
        fs.append(f(point))

    optimize.fmin_ncg(f, x0, f_prime, fhess=hessian, callback=record, avextol=1e-12)
    return xs, ys, fs
def __init__(self, x, f, grad_f, hess_f, maxiter=None, tol=1e-7, verbose=False):
    """Run Newton-CG from ``x`` and keep the outcome on the instance.

    Args:
        x: starting point.
        f: objective function.
        grad_f: gradient of ``f``.
        hess_f: Hessian of ``f``.
        maxiter: iteration cap forwarded to ``fmin_ncg``.
        tol: forwarded to ``fmin_ncg`` as ``avextol``.
        verbose: forwarded to ``fmin_ncg`` as ``disp``.

    Sets ``self.x`` (optimum), ``self.fval`` (objective value there) and
    ``self.time`` (wall-clock seconds spent in the optimiser).
    """
    started = time()
    result = fmin_ncg(
        f,
        x,
        grad_f,
        fhess=hess_f,
        args=(),
        maxiter=maxiter,
        avextol=tol,
        full_output=True,
        disp=verbose,
    )
    self.x = result[0]
    self.fval = result[1]
    self.time = time() - started
def Opti(a, b, c):
    """Minimise the penalised objective below with Newton-CG, from zeros.

    The objective and gradient read module-level data (``d``, ``X``,
    ``XT``, ``D``, ``DT``, ``lamtip``, ``lamtip1``, ``ss``, ``ss1``,
    ``lam``, ``GAM``) and the helpers ``expp`` / ``derexpp``.

    Args:
        a: weight of the quadratic term ``w'w``.
        b: coefficients of the linear term.
        c: weight of the data term.

    Returns:
        The optimum reported by ``fmin_ncg``.
    """
    # min_w a * w'*w + b'*w + c \sum_i{ 1/lam/e^gam * [ lam ti^p exp (zi) - di *( ln lam + ln p + (p-1) ln ti + zi ) ]
    # zi = beta' xi
    # the SUM * lam * e^gam =

    def obj_value(ww):
        col = np.reshape(ww, (d, 1))
        z = np.dot(X, ww)
        ez = expp(z)
        quadratic = a * np.dot(col.T, col)
        linear = np.dot(b, col)
        data_term = c * (np.dot(lamtip, ez) - (np.dot(D, z) + ss + ss1)) / lam / math.exp(GAM)
        return float(quadratic + linear + data_term)

    def obj_grad(ww):
        col = np.reshape(ww, (d, 1))
        z = np.dot(X, col)
        gl = lamtip1 * derexpp(z) - DT
        grad = 2 * a * col + np.reshape(b, (d, 1)) + c * np.dot(XT, gl) / lam / math.exp(GAM)
        # fmin_ncg is started from a flat vector, so return the column flattened.
        return grad[:, 0]

    return fmin_ncg(obj_value, np.zeros(d), fprime=obj_grad, disp=False)
def test_ncg_hessp(self, use_wrapper=False):
    """ Newton conjugate gradient with Hessian times a vector p """
    if use_wrapper:
        # minimize() takes the Hessian-vector product through ``hessp``,
        # spells its options 'maxiter' / 'return_all', and returns a
        # result object whose optimum is stored under 'x'.
        opts = {'maxiter': self.maxiter, 'disp': False,
                'return_all': False}
        retval = optimize.minimize(self.func, self.startparams,
                                   method='Newton-CG', jac=self.grad,
                                   hessp=self.hessp, args=(),
                                   options=opts)['x']
    else:
        retval = optimize.fmin_ncg(self.func, self.startparams, self.grad,
                                   fhess_p=self.hessp, args=(),
                                   maxiter=self.maxiter,
                                   full_output=False, disp=False,
                                   retall=False)

    params = retval

    err = abs(self.func(params) - self.func(self.solution))
    assert_(err < 1e-6)

    # Ensure that function call counts are 'known good'; these are from
    # Scipy 0.7.0. Don't allow them to increase.
    assert_(self.funccalls == 7, self.funccalls)
    assert_(self.gradcalls <= 18, self.gradcalls)  # 0.9.0
    # assert_(self.gradcalls == 18, self.gradcalls)  # 0.8.0
    # assert_(self.gradcalls == 22, self.gradcalls)  # 0.7.0

    # Ensure that the function behaves the same; this is from Scipy 0.7.0
    assert_(np.allclose(self.trace[3:5],
                        [[-4.35700753e-07, -5.24869435e-01, 4.87527480e-01],
                         [-4.35700753e-07, -5.24869401e-01, 4.87527774e-01]],
                        atol=1e-6, rtol=1e-7), self.trace[:5])
def maximize(L, DL, D2L, x, method=None, disp=False):
    """Main function to perform numerical optimization.

    L, DL and D2L are the objective function and its derivative and
    Hessian, and x is the initial guess (current rating).  The objective is
    maximised by minimising its negation.  Up to four methods are tried,
    from fastest and least robust to slowest and most robust; ``method``
    may name one of 'ncg', 'bfgs' or 'powell' to skip the others.  The
    downhill-simplex fallback is always tried last.

    Returns whatever ``check_max`` yields for the first method that does
    not return None; the original author documents this as the optimum, or
    None if an error occurred.
    """
    def mL(v):
        return -L(v)

    def mDL(v):
        return -DL(v)

    def mD2L(v):
        return -D2L(v)

    # NOTE(review): the integer handed to check_max is presumably the index
    # of the warning flag in each optimiser's full-output tuple; confirm
    # against check_max.

    # Newton Conjugate Gradient
    if method is None or method == 'ncg':
        func = lambda x0: opt.fmin_ncg(mL, x0, fprime=mDL, fhess=mD2L,
                                       disp=disp, full_output=True,
                                       avextol=1e-10)
        xm = check_max(func, x, 5, 'NCG', disp)
        # Identity test: ``!= None`` is elementwise on numpy arrays.
        if xm is not None:
            return xm

    # Broyden-Fletcher-Goldfarb-Shanno
    if method is None or method == 'bfgs':
        func = lambda x0: opt.fmin_bfgs(mL, x0, fprime=mDL, disp=disp,
                                        full_output=True, gtol=1e-10)
        xm = check_max(func, x, 6, 'BFGS', disp)
        if xm is not None:
            return xm

    # Powell
    if method is None or method == 'powell':
        func = lambda x0: opt.fmin_powell(mL, x0, disp=disp,
                                          full_output=True, ftol=1e-10)
        xm = check_max(func, x, 5, 'POWELL', disp)
        if xm is not None:
            return xm

    # Downhill simplex (last resort)
    func = lambda x0: opt.fmin(mL, x0, disp=disp, full_output=True,
                               ftol=1e-10)
    xm = check_max(func, x, 4, 'DOWNHILL_SIMPLEX', disp)
    return xm
def _fit_ncg(f, score, start_params, fargs, kwargs, disp=True, maxiter=100, callback=None, retall=False, full_output=True, hess=None):
    """Fit with scipy's Newton-CG routine and normalise its return value.

    Args:
        f: objective function.
        score: gradient of ``f``.
        start_params: starting point.
        fargs: extra positional arguments passed to ``f`` and ``score``.
        kwargs: dict of solver settings; missing 'fhess_p', 'avextol' and
            'epsilon' entries are filled in with defaults (the dict is
            modified in place).
        disp, maxiter, callback, retall, full_output: forwarded to
            ``fmin_ncg``.
        hess: Hessian of ``f``, forwarded as ``fhess``.

    Returns:
        ``(xopt, retvals)``.  ``xopt`` is the optimum.  ``retvals`` is a
        dict of diagnostics when ``full_output`` is true, a dict holding
        only 'allvecs' when just ``retall`` is true, and None otherwise.
    """
    fhess_p = kwargs.setdefault('fhess_p', None)
    avextol = kwargs.setdefault('avextol', 1.0000000000000001e-05)
    epsilon = kwargs.setdefault('epsilon', 1.4901161193847656e-08)
    retvals = optimize.fmin_ncg(f, start_params, score, fhess_p=fhess_p,
                                fhess=hess, args=fargs, avextol=avextol,
                                epsilon=epsilon, maxiter=maxiter,
                                full_output=full_output, disp=disp,
                                retall=retall, callback=callback)
    if full_output:
        if not retall:
            xopt, fopt, fcalls, gcalls, hcalls, warnflag = retvals
        else:
            xopt, fopt, fcalls, gcalls, hcalls, warnflag, allvecs = retvals
        converged = not warnflag
        retvals = {'fopt': fopt, 'fcalls': fcalls, 'gcalls': gcalls,
                   'hcalls': hcalls, 'warnflag': warnflag,
                   'converged': converged}
        if retall:
            retvals.update({'allvecs': allvecs})
    elif retall:
        # Without full_output scipy returns (xopt, allvecs).
        xopt, allvecs = retvals
        retvals = {'allvecs': allvecs}
    else:
        # Without full_output scipy returns only the optimum; it belongs in
        # the first slot, not the diagnostics slot.
        xopt = retvals
        retvals = None
    return xopt, retvals
def fitGLM(X, Y, H, l, hl, sp, norm, of, lateral, num_neurons_to_estimate):
    """Fit one GLM per neuron with Newton-CG.

    Each fit starts from a Laplace-regularised pseudo-inverse solution
    padded with zeros for the extra parameters.

    NOTE(review): ``X.T * X`` and ``rpi[:, i].getA1()`` suggest X and Y are
    numpy matrices; confirm with the callers.

    Args:
        X: stimulus matrix, one row per presentation.
        Y: response matrix, one column per neuron.
        H: optional history matrix, or None.
        l: scale applied to the Laplace bias handed to ``GLM``.
        hl, sp, norm, of: forwarded unchanged to ``GLM``.
        lateral: when true, the other neurons' responses are appended to
            the history input.
        num_neurons_to_estimate: number of leading neurons to fit.

    Returns:
        ``[Ks, rpi, glm]``: the fitted parameter rows, the pseudo-inverse
        solution and the ``GLM`` object of the last fitted neuron.
    """
    num_pres, num_neurons = numpy.shape(Y)
    num_pres, kernel_size = numpy.shape(X)

    # Identity test: ``H != None`` is elementwise on arrays and matrices.
    has_history = H is not None
    if has_history:
        (trash, hist_size) = numpy.shape(H)
    else:
        hist_size = 0

    Ks = numpy.zeros((num_neurons, kernel_size + 2 + hist_size + lateral * (num_neurons - 1)))
    laplace = laplaceBias(numpy.sqrt(kernel_size), numpy.sqrt(kernel_size))
    rpi = numpy.linalg.pinv(X.T * X + __main__.__dict__.get("RPILaplaceBias", 0.0001) * laplace) * X.T * Y

    for i in range(0, num_neurons_to_estimate):
        print(i)
        k0 = (
            rpi[:, i].getA1().tolist()
            + [0, 0]
            + numpy.zeros((1, hist_size)).flatten().tolist()
            + numpy.zeros((1, lateral * (num_neurons - 1))).flatten().tolist()
        )
        if lateral and has_history:
            HH = numpy.hstack((H, Y[:, :i], Y[:, i + 1:]))
        elif lateral:
            HH = numpy.hstack((Y[:, :i], Y[:, i + 1:]))
        else:
            HH = H

        glm = GLM(numpy.mat(X), numpy.mat(Y[:, i]), l * laplace, HH, hl, sp, norm, of=of)
        K = fmin_ncg(glm.func(), numpy.array(k0), glm.der(), fhess=glm.hess(), avextol=0.0000001, maxiter=200)
        Ks[i, :] = K

    return [Ks, rpi, glm]
def fmin_ncg(self, model, funcs, *args, **kwargs):
    """Minimise the model energy with Newton-CG and store the optimum.

    The energy, gradient and Hessian callables come from ``self.efunc``,
    ``self.gfunc`` and ``self.hfunc``.  Extra positional and keyword
    arguments are forwarded to ``scipy.optimize.fmin_ncg``.

    ``model.coords`` is replaced by the optimum.  The raw return value of
    ``scipy.optimize.fmin_ncg`` is returned unchanged.
    """
    efunc = self.efunc(model, funcs)
    gfunc = self.gfunc(model, funcs)
    hfunc = self.hfunc(model, funcs)
    result = optimize.fmin_ncg(efunc, model.coords, gfunc, fhess=hfunc, *args, **kwargs)
    # scipy returns the bare optimum unless full_output or retall is set,
    # in which case it returns a tuple with the optimum first.  Indexing a
    # bare array with [0] would keep only its first coordinate.
    model.coords = result[0] if isinstance(result, tuple) else result
    return result
def test_ncg_hessp(self):
    # Newton-CG driven by a Hessian-times-vector callable, either through
    # minimize() or through the legacy fmin_ncg entry point.
    if not self.use_wrapper:
        params = optimize.fmin_ncg(self.func, self.startparams, self.grad,
                                   fhess_p=self.hessp, args=(),
                                   maxiter=self.maxiter,
                                   full_output=False, disp=self.disp,
                                   retall=False)
    else:
        opts = {'maxiter': self.maxiter, 'disp': self.disp,
                'return_all': False}
        result = optimize.minimize(self.func, self.startparams,
                                   method='Newton-CG', jac=self.grad,
                                   hessp=self.hessp, args=(),
                                   options=opts)
        params = result['x']

    assert_allclose(self.func(params), self.func(self.solution),
                    atol=1e-6)

    # Call counts must stay at the 'known good' values recorded with
    # Scipy 0.7.0; they are not allowed to increase.
    assert_(self.funccalls == 7, self.funccalls)
    assert_(self.gradcalls <= 18, self.gradcalls)  # 0.9.0
    # assert_(self.gradcalls == 18, self.gradcalls)  # 0.8.0
    # assert_(self.gradcalls == 22, self.gradcalls)  # 0.7.0

    # The iterates themselves must match the Scipy 0.7.0 trace.
    expected_trace = [[-4.35700753e-07, -5.24869435e-01, 4.87527480e-01],
                      [-4.35700753e-07, -5.24869401e-01, 4.87527774e-01]]
    assert_allclose(self.trace[3:5], expected_trace,
                    atol=1e-6, rtol=1e-7)
def minimize(fun, x0, jac=None, hess=None, *args, **kwargs):
    """Newton-CG-only stand-in for a ``minimize``-style entry point.

    Args:
        fun: objective function.
        x0: starting point.
        jac: gradient of ``fun``.
        hess: Hessian of ``fun``.
        *args, **kwargs: forwarded to ``optimize.fmin_ncg``; a ``method``
            keyword is consumed here and must be "Newton-CG".

    Returns:
        A ``_Result`` with ``x`` (the optimum), ``success`` (true when the
        optimiser's warning flag is 0) and ``message`` (always "unknown").
    """
    method = kwargs.pop("method", "Newton-CG")
    assert method == "Newton-CG"

    output = optimize.fmin_ncg(f=fun, x0=x0, fprime=jac, fhess=hess, full_output=True, *args, **kwargs)

    res = _Result()
    res.x = output[0]
    res.success = output[5] == 0  # index 5 of the full output is the warning flag
    res.message = "unknown"
    return res
def __call__(self, net, input, target):
    """Run Newton-CG on ``self.fcn`` and write the optimum into ``self.x``.

    ``net``, ``input`` and ``target`` are accepted but not used in this
    body.  ``self.kwargs`` is forwarded verbatim to the optimiser, so output
    verbosity is whatever 'disp' the caller put there.  ``self.x`` is
    updated in place, not rebound.
    """
    from scipy.optimize import fmin_ncg

    start = self.x.copy()
    optimum = fmin_ncg(self.fcn, start, fprime=self.grad,
                       callback=self.step, **self.kwargs)
    self.x[:] = optimum
    return None
def test_ncg(self, use_wrapper=False):
    """ line-search Newton conjugate gradient optimization routine """
    if use_wrapper:
        # minimize() spells its options 'maxiter' / 'return_all', takes no
        # full_output/retall keywords, and returns a result object whose
        # optimum is stored under 'x'.
        opts = {'maxiter': self.maxiter, 'disp': False,
                'return_all': False}
        retval = optimize.minimize(self.func, self.startparams,
                                   method='Newton-CG', jac=self.grad,
                                   args=(), options=opts)['x']
    else:
        retval = optimize.fmin_ncg(self.func, self.startparams, self.grad,
                                   args=(), maxiter=self.maxiter,
                                   full_output=False, disp=False,
                                   retall=False)

    params = retval

    assert_allclose(self.func(params), self.func(self.solution),
                    atol=1e-6)

    # Ensure that function call counts are 'known good'; these are from
    # Scipy 0.7.0. Don't allow them to increase.
    assert_(self.funccalls == 7, self.funccalls)
    assert_(self.gradcalls <= 18, self.gradcalls)  # 0.9.0
    # assert_(self.gradcalls == 18, self.gradcalls)  # 0.8.0
    # assert_(self.gradcalls == 22, self.gradcalls)  # 0.7.0

    # Ensure that the function behaves the same; this is from Scipy 0.7.0
    assert_allclose(self.trace[3:5],
                    [[-4.35700753e-07, -5.24869435e-01, 4.87527480e-01],
                     [-4.35700753e-07, -5.24869401e-01, 4.87527774e-01]],
                    atol=1e-6, rtol=1e-7)
def par_est(self):
    """Estimate the parameters by minimising the negative ``self.ll``.

    The starting point is the least-squares solution
    ``inv(x'x) x'y``.  ``self.optim`` selects the optimiser: 'newton'
    (the ``newton`` helper), 'bfgs' or 'ncg' (scipy).

    Returns:
        ``(par_hat, warn)``: the optimiser's full output and a bool that is
        True when its warning flag is positive.
    """
    xtx = spdot(self.x.T, self.x)
    xty = spdot(self.x.T, self.y)
    start = np.dot(la.inv(xtx), xty)

    def flogl(par):
        return -self.ll(par)

    if self.optim == 'newton':
        # The newton helper receives the un-negated gradient and Hessian.
        def fgrad(par):
            return self.gradient(par)

        def fhess(par):
            return self.hessian(par)

        par_hat = newton(flogl, start, fgrad, fhess, self.maxiter)
        flag = par_hat[2]
    else:
        def fgrad(par):
            return -self.gradient(par)

        if self.optim == 'bfgs':
            par_hat = op.fmin_bfgs(flogl, start, fgrad, full_output=1, disp=0)
            flag = par_hat[6]
        if self.optim == 'ncg':
            def fhess(par):
                return -self.hessian(par)

            par_hat = op.fmin_ncg(flogl, start, fgrad, fhess=fhess,
                                  full_output=1, disp=0)
            flag = par_hat[5]

    warn = bool(flag > 0)
    return par_hat, warn
def __updateD(X, A, D, R, nne, optfunc):
    """Update every row of ``D`` by minimising its per-slice objective.

    Args:
        X: sequence of slices; row ``i`` of ``D`` is fitted against
            ``X[i]``.
        A, R: passed to ``Updater`` together with ``X[i]``.
        D: matrix whose rows are optimised; modified in place.
        nne: when positive, every entry is bounded below by 0 and L-BFGS-B
            is used regardless of ``optfunc``.
        optfunc: 'lbfgs' is supported; 'ncg' and 'tnc' are not implemented.
            Any other value leaves the row untouched.

    Returns:
        ``(D, f)`` where ``f`` is the sum of the optimised objective
        values.

    Raises:
        NotImplementedError: for ``optfunc`` 'ncg' or 'tnc' when ``nne`` is
            not positive.
    """
    f = 0
    for i in range(len(X)):
        d = D[i, :]
        u = Updater(X[i], A, R)
        if nne > 0:
            bounds = len(d) * [(0, None)]
            res = fmin_l_bfgs_b(
                u.updateD_F, d, u.updateD_G, factr=1e12, bounds=bounds
            )
        elif optfunc == 'lbfgs':
            res = fmin_l_bfgs_b(u.updateD_F, d, u.updateD_G, factr=1e12)
        elif optfunc in ('ncg', 'tnc'):
            # TODO: run fmin_ncg / fmin_tnc here, check the return value and
            # update D, f.  Fail before spending time on an optimisation
            # whose result would be thrown away.
            raise NotImplementedError()
        else:
            continue
        # The bounded branch must store its result as well; otherwise the
        # optimisation has no effect on D or f.
        D[i, :] = res[0]
        f += res[1]
    return D, f
def test_ncg(self):
    """ line-search Newton conjugate gradient optimization routine """
    params = optimize.fmin_ncg(self.func, self.startparams, self.grad,
                               args=(), maxiter=self.maxiter,
                               full_output=False, disp=False,
                               retall=False)

    # The objective at the optimum must match the known solution.
    err = abs(self.func(params) - self.func(self.solution))
    assert_(err < 1e-6)

    # Call counts must stay at the 'known good' values recorded with
    # Scipy 0.7.0; they are not allowed to increase.
    assert_(self.funccalls == 7, self.funccalls)
    assert_(self.gradcalls <= 18, self.gradcalls)  # 0.9.0
    # assert_(self.gradcalls == 18, self.gradcalls)  # 0.8.0
    # assert_(self.gradcalls == 22, self.gradcalls)  # 0.7.0

    # The iterates themselves must match the Scipy 0.7.0 trace.
    expected_trace = [[-4.35700753e-07, -5.24869435e-01, 4.87527480e-01],
                      [-4.35700753e-07, -5.24869401e-01, 4.87527774e-01]]
    trace_matches = np.allclose(self.trace[3:5], expected_trace,
                                atol=1e-6, rtol=1e-7)
    assert_(trace_matches, self.trace[:5])
def maximize(L, DL, D2L, x, method=None, disp=False):
    """Maximise ``L`` from the initial guess ``x``.

    ``DL`` and ``D2L`` are the derivative and Hessian of ``L``.  The
    problem is solved by minimising ``-L``.  With ``method=None`` the
    optimisers are tried in the order Newton-CG, BFGS, Powell and downhill
    simplex; ``method`` may be 'ncg', 'bfgs' or 'powell' to try only that
    one before the simplex fallback, which is always tried last.

    Returns the first result from ``check_max`` that is not None, or the
    result of the simplex attempt.
    """
    def mL(v):
        return -L(v)

    def mDL(v):
        return -DL(v)

    def mD2L(v):
        return -D2L(v)

    # NOTE(review): the integer handed to check_max is presumably the index
    # of the warning flag in each optimiser's full-output tuple; confirm
    # against check_max.
    if method is None or method == 'ncg':
        func = lambda x0: opt.fmin_ncg(mL, x0, fprime=mDL, fhess=mD2L,
                                       disp=disp, full_output=True,
                                       avextol=1e-10)
        xm = check_max(func, x, 5, 'NCG', disp)
        # Identity test: ``!= None`` is elementwise on numpy arrays.
        if xm is not None:
            return xm

    if method is None or method == 'bfgs':
        func = lambda x0: opt.fmin_bfgs(mL, x0, fprime=mDL,
                                        disp=disp, full_output=True,
                                        gtol=1e-10)
        xm = check_max(func, x, 6, 'BFGS', disp)
        if xm is not None:
            return xm

    if method is None or method == 'powell':
        func = lambda x0: opt.fmin_powell(mL, x0, disp=disp,
                                          full_output=True, ftol=1e-10)
        xm = check_max(func, x, 5, 'POWELL', disp)
        if xm is not None:
            return xm

    func = lambda x0: opt.fmin(mL, x0, disp=disp, full_output=True,
                               ftol=1e-10)
    xm = check_max(func, x, 4, 'DOWNHILL_SIMPLEX', disp)
    return xm
def infer_ctx(options, seq, f_cost, f_ctx_grad, init_ctx=None, f_hess_p=None, maxiter=100):
    """Optimise a context vector for a single sequence.

    Args:
        options: mapping; ``options["ctx_dim"]`` sizes the random initial
            context when ``init_ctx`` is not given.
        seq: the sequence; it is wrapped in a one-element list for
            ``prepare_data``.
        f_cost: cost function called as ``f_cost(x, mask, ctx)``.
        f_ctx_grad: gradient of the cost with respect to the context.
        init_ctx: optional initial context; defaults to small Gaussian
            noise of shape ``(1, ctx_dim)`` in float32.
        f_hess_p: optional Hessian-vector product.  When given, Newton-CG
            is used; otherwise BFGS.
        maxiter: iteration cap for the optimiser.

    Returns:
        The optimum returned by the chosen scipy optimiser.
    """
    # Identity test: ``== None`` is elementwise when an array is supplied.
    if init_ctx is None:
        init_ctx = 1e-3 * numpy.random.randn(1, options["ctx_dim"]).astype("float32")
    x, mask, ctx0 = prepare_data([seq], init_ctx)

    def _row(vec):
        # scipy hands over flat (n,) vectors; the supplied functions are
        # called with a (1, n) array.
        return vec.reshape([1, vec.shape[0]])

    def _g(ctx):
        return f_ctx_grad(x, mask, _row(ctx).astype("float32")).reshape([ctx.shape[0]])

    def _c(ctx):
        return f_cost(x, mask, _row(ctx).astype("float32"))

    def _hp(ctx, p):
        return f_hess_p(x, mask, _row(ctx), _row(p)).astype("float32")

    if f_hess_p:
        return optimize.fmin_ncg(_c, ctx0[0, :], fprime=_g, fhess_p=_hp, callback=None, maxiter=maxiter)
    return optimize.fmin_bfgs(_c, ctx0[0, :], fprime=_g, callback=None, maxiter=maxiter)
def one_vs_all(X, y, num_labels, lamb):
    """Train one regularised logistic-regression classifier per class.

    Classes are numbered 1..num_labels: classifier ``k`` (0-based row of
    the result) is trained on the indicator ``y == k + 1``.

    Args:
        X: Matrix of features (one row per training example).
        y: Vector of labels.
        num_labels: Number of classes.
        lamb: Regularization parameter.

    Returns:
        all_theta: Array of shape (num_labels, num_features + 1) holding
            the fitted parameters of each classifier in its row.

    Raises:
        An error occurs if the number of labels is 0.
    """
    if num_labels == 0:
        raise Error('num_labels = 0')

    num_train_ex = X.shape[0]
    num_params = X.shape[1] + 1
    all_theta = numpy.zeros((num_labels, num_params))
    # Design matrix with a leading column of ones.
    aug_x = numpy.c_[numpy.ones((num_train_ex, 1)), X]

    for label_index in range(num_labels):
        theta_start = numpy.zeros((num_params, 1)).flatten()
        label_column = (label_index + 1) * numpy.ones((num_train_ex, 1))
        y_arg = numpy.equal(y, label_column).astype(int)
        solution = fmin_ncg(compute_cost, theta_start,
                            fprime=compute_gradient,
                            args=(aug_x, y_arg, num_train_ex, lamb),
                            avextol=1e-10, epsilon=1e-10, maxiter=400,
                            full_output=1)
        all_theta[label_index, :] = numpy.reshape(
            solution[0], (1, num_params), order='F')
    return all_theta
def minimize(func, x0, method='CG', options=None, jac=None, callback=None):
    """Dispatch to one of scipy's legacy ``fmin_*`` optimisers.

    Args:
        func: objective function.
        x0: starting point.
        method: case-insensitive name; one of 'Nelder-Mead', 'Powell',
            'CG', 'BFGS', 'L-BFGS-B' or 'Newton-CG'.
        options: optional mapping; the keys 'disp' and 'maxiter' are
            honoured.  ``None`` means no options.
        jac: gradient of ``func`` (used by the gradient-based methods).
        callback: per-iteration callback (not forwarded for 'L-BFGS-B').

    Returns:
        An object whose ``x`` attribute holds the optimum.

    Raises:
        ValueError: if ``method`` is not one of the names above.
    """
    method = method.lower()
    # The default options=None used to crash on ``'disp' in options``.
    if options is None:
        options = {}
    disp = options['disp'] if 'disp' in options else False
    maxiter = options['maxiter'] if 'maxiter' in options else None

    if method == 'nelder-mead':
        x = fmin(func=func, x0=x0, disp=disp, maxiter=maxiter, callback=callback)
    elif method == 'powell':
        x = fmin_powell(func=func, x0=x0, disp=disp, maxiter=maxiter, callback=callback)
    elif method == 'cg':
        x = fmin_cg(f=func, x0=x0, fprime=jac, disp=disp, maxiter=maxiter, callback=callback)
    elif method == 'bfgs':
        x = fmin_bfgs(f=func, x0=x0, fprime=jac, disp=disp, maxiter=maxiter, callback=callback)
    elif method == 'l-bfgs-b':
        # NOTE(review): 'maxiter' and 'callback' are not forwarded here and
        # ``d`` is printed unconditionally; kept as in the original.
        d = ceil(1000000 / len(x0))
        print(d)
        x, _, _ = fmin_l_bfgs_b(func=func, x0=x0, fprime=jac, disp=(d if disp else 0))
    elif method == 'newton-cg':
        x = fmin_ncg(f=func, x0=x0, fprime=jac, disp=disp, maxiter=maxiter, callback=callback)
    else:
        # Previously this fell through to an UnboundLocalError on ``x``.
        raise ValueError("Unknown method '%s'" % method)

    class Result(object):
        def __init__(self, x):
            self.x = x

    return Result(x)
def run(self):
    """Maximise the problem's objective with the configured optimiser.

    ``self.problem`` supplies ``f`` and ``grad``; both are negated because
    the scipy routines minimise.  ``self.optimizer`` selects the routine
    ("bfgs", "cg", "ncg", "fmin", "powell" or "l-bfgs-b"), and only the
    entries of ``self.optParams`` that the routine is known to accept are
    forwarded.

    Returns:
        The optimised weight vector.

    Raises:
        Exception: if ``self.optimizer`` is not one of the names above.
    """
    optimizer = self.optimizer
    p = self.problem

    # coerce return types
    f = lambda wt: numpy.float64(p.f(wt))
    grad = lambda wt: numpy.array(list(map(numpy.float64, p.grad(wt))))

    # negate for minimization
    neg_f = lambda wt: -f(wt)
    neg_grad = lambda wt: -grad(wt)
    if not p.usef:
        neg_f = lambda wt: -p._fDummy(wt)
    log = logs.getlogger(self.__class__.__name__)

    def select(*names):
        # Keep only the options the chosen optimiser understands.
        return dict((k, v) for k, v in self.optParams.items() if k in names)

    if optimizer == "bfgs":
        params = select("gtol", "epsilon", "maxiter")
        if self.verbose:
            print("starting optimization with %s... %s\n" % (optimizer, params))
        wt, f_opt, grad_opt, Hopt, func_calls, grad_calls, warn_flags = fmin_bfgs(neg_f, self.wt, fprime=neg_grad, full_output=True, **params)
        if self.verbose:
            print("optimization done with %s..." % optimizer)
            print("f-opt: %.16f\nfunction evaluations: %d\nwarning flags: %d\n" % (-f_opt, func_calls, warn_flags))
    elif optimizer == "cg":
        params = select("gtol", "epsilon", "maxiter")
        log.info("starting optimization with %s... %s" % (optimizer, params))
        wt, f_opt, func_calls, grad_calls, warn_flags = fmin_cg(neg_f, self.wt, fprime=neg_grad, args=(), full_output=True, **params)
        log.info("optimization done with %s..." % optimizer)
        log.info("f-opt: %.16f\nfunction evaluations: %d\nwarning flags: %d\n" % (-f_opt, func_calls, warn_flags))
    elif optimizer == "ncg":
        params = select("avextol", "epsilon", "maxiter")
        log.info("starting optimization with %s... %s" % (optimizer, params))
        # fmin_ncg's full output has six entries (it also reports the
        # Hessian call count), not five like fmin_cg.
        wt, f_opt, func_calls, grad_calls, hess_calls, warn_flags = fmin_ncg(neg_f, self.wt, fprime=neg_grad, args=(), full_output=True, **params)
        log.info("optimization done with %s..." % optimizer)
        log.info("f-opt: %.16f\nfunction evaluations: %d\nwarning flags: %d\n" % (-f_opt, func_calls, warn_flags))
    elif optimizer == "fmin":
        params = select("xtol", "ftol", "maxiter")
        log.info("starting optimization with %s... %s" % (optimizer, params))
        # With full_output the optimum is the first tuple entry.
        wt = fmin(neg_f, self.wt, args=(), full_output=True, **params)[0]
        log.info("optimization done with %s..." % optimizer)
    elif optimizer == "powell":
        params = select("xtol", "ftol", "maxiter")
        log.info("starting optimization with %s... %s" % (optimizer, params))
        wt = fmin_powell(neg_f, self.wt, args=(), full_output=True, **params)[0]
        log.info("optimization done with %s..." % optimizer)
    elif optimizer == 'l-bfgs-b':
        # NOTE(review): fmin_l_bfgs_b names its gradient tolerance 'pgtol';
        # a forwarded 'gtol' is likely to be rejected.  Confirm intent.
        params = select("gtol", "epsilon", "maxiter", 'bounds')
        log.info("starting optimization with %s... %s" % (optimizer, params))
        if 'bounds' in params:
            # One (min, max) pair is replicated for every weight.
            params['bounds'] = (params['bounds'],) * len(self.wt)
        wt, f_opt, d = fmin_l_bfgs_b(neg_f, self.wt, fprime=neg_grad, **params)
        log.info("optimization done with %s..." % optimizer)
        log.info("f-opt: %.16f\n" % (-f_opt))
    else:
        raise Exception("Unknown optimizer '%s'" % optimizer)
    return wt
def find_energy_min(eptm, method='fmin_l_bfgs_b', tol=1e-8, approx_grad=0, epsilon=1e-8):
    '''
    Performs the energy minimisation.

    The starting positions and bounds come from ``precondition(eptm)`` and
    ``eptm.stamp`` is incremented once per call.  ``method`` names the
    ``scipy.optimize`` routine to use: 'fmin_l_bfgs_b', 'fmin', 'fmin_ncg',
    'fmin_tnc' or 'fmin_bfgs'.  ``approx_grad`` is accepted but currently
    unused.

    Returns ``(pos0, output)`` where ``output`` is the raw return value of
    the optimiser, or 0 when ``method`` is not recognised.
    '''
    pos0, bounds = precondition(eptm)
    eptm.stamp += 1

    if method == 'fmin_l_bfgs_b':
        # `factr` is relaxed (1e10) to avoid too long computation
        return pos0, optimize.fmin_l_bfgs_b(
            opt_energy, pos0.flatten(),
            fprime=opt_gradient,
            bounds=bounds.flatten(),
            args=(eptm,),
            factr=1e10, m=10, pgtol=tol, epsilon=epsilon,
            iprint=1, maxfun=150, disp=None)
    if method == 'fmin':
        return pos0, optimize.fmin(
            opt_energy, pos0.flatten(),
            ftol=tol, xtol=0.01,
            args=(eptm,), callback=opt_callback)
    if method == 'fmin_ncg':
        return pos0, optimize.fmin_ncg(
            opt_energy, pos0.flatten(),
            fprime=opt_gradient,
            args=(eptm,),
            avextol=tol, retall=True, maxiter=100)
    if method == 'fmin_tnc':
        return pos0, optimize.fmin_tnc(
            opt_energy, pos0.flatten(),
            fprime=opt_gradient,
            args=(eptm,),
            pgtol=tol, bounds=bounds, maxCGit=0, disp=5)
    if method == 'fmin_bfgs':
        return pos0, optimize.fmin_bfgs(
            opt_energy, pos0.flatten(),
            fprime=opt_gradient,
            args=(eptm,),
            gtol=tol, norm=np.inf, retall=1, callback=opt_callback)
    return pos0, 0
def maximize(self, *a):
    """Optimise the packed model parameters with Newton-CG.

    The positional arguments are packed by ``self.model.pack`` into a
    parameter vector plus extra arguments.  ``fmin_ncg`` is then run on
    ``self.logL`` using its gradient and Hessian-vector product, and the
    result is unpacked again by ``self.model.unpack``.  ``self.iters`` is
    reset to 0 before the run.
    """
    print('Maximizing using Newton Conjugate Gradient method')
    self.iters = 0
    theta0, args = self.model.pack(*a)
    theta_opt = opt.fmin_ncg(
        f=self.logL,
        x0=theta0,
        fprime=self.logL_grad,
        fhess_p=self.logL_hess_p,
        args=args,
        maxiter=None,
        avextol=1.0e-10,
        callback=self.callback,
    )
    return self.model.unpack(theta_opt, args)
def run(self):
    """Maximise the problem's objective with the configured optimiser.

    ``self.problem`` supplies ``f`` and ``grad``; both are negated because
    the scipy routines minimise.  ``self.optimizer`` selects the routine
    ("bfgs", "cg", "ncg", "fmin", "powell" or "l-bfgs-b"), and only the
    entries of ``self.optParams`` that the routine is known to accept are
    forwarded.  Progress is printed to stdout.

    Returns:
        The optimised weight vector.

    Raises:
        Exception: if ``self.optimizer`` is not one of the names above.
    """
    optimizer = self.optimizer
    p = self.problem

    # coerce return types (a list is built so this works whether or not
    # ``map`` is lazy)
    f = lambda wt: numpy.float64(p.f(wt))
    grad = lambda wt: numpy.array([numpy.float64(g) for g in p.grad(wt)])

    # negate for minimization
    neg_f = lambda wt: -f(wt)
    neg_grad = lambda wt: -grad(wt)
    if not p.useGrad():
        neg_grad = None
    if not p.useF():
        # NOTE(review): inside a class body a ``__name`` attribute is
        # subject to name mangling; confirm this resolves as intended.
        neg_f = lambda wt: -p.__fDummy(wt)

    def select(*names):
        # Keep only the options the chosen optimiser understands.
        return dict((k, v) for k, v in self.optParams.items() if k in names)

    if optimizer == "bfgs":
        params = select("gtol", "epsilon", "maxiter")
        print("starting optimization with %s... %s" % (optimizer, params))
        wt, f_opt, grad_opt, Hopt, func_calls, grad_calls, warn_flags = fmin_bfgs(neg_f, self.wt, fprime=neg_grad, full_output=True, **params)
        print("optimization done with %s..." % optimizer)
        print("f-opt: %.16f\nfunction evaluations: %d\nwarning flags: %d\n" % (-f_opt, func_calls, warn_flags))
    elif optimizer == "cg":
        params = select("gtol", "epsilon", "maxiter")
        print("starting optimization with %s... %s" % (optimizer, params))
        wt, f_opt, func_calls, grad_calls, warn_flags = fmin_cg(neg_f, self.wt, fprime=neg_grad, args=(), full_output=True, **params)
        print("optimization done with %s..." % optimizer)
        print("f-opt: %.16f\nfunction evaluations: %d\nwarning flags: %d\n" % (-f_opt, func_calls, warn_flags))
    elif optimizer == "ncg":
        params = select("avextol", "epsilon", "maxiter")
        print("starting optimization with %s... %s" % (optimizer, params))
        # fmin_ncg's full output has six entries (it also reports the
        # Hessian call count), not five like fmin_cg.
        wt, f_opt, func_calls, grad_calls, hess_calls, warn_flags = fmin_ncg(neg_f, self.wt, fprime=neg_grad, args=(), full_output=True, **params)
        print("optimization done with %s..." % optimizer)
        print("f-opt: %.16f\nfunction evaluations: %d\nwarning flags: %d\n" % (-f_opt, func_calls, warn_flags))
    elif optimizer == "fmin":
        params = select("xtol", "ftol", "maxiter")
        print("starting optimization with %s... %s" % (optimizer, params))
        # With full_output the optimum is the first tuple entry.
        wt = fmin(neg_f, self.wt, args=(), full_output=True, **params)[0]
        print("optimization done with %s..." % optimizer)
    elif optimizer == "powell":
        params = select("xtol", "ftol", "maxiter")
        print("starting optimization with %s... %s" % (optimizer, params))
        wt = fmin_powell(neg_f, self.wt, args=(), full_output=True, **params)[0]
        print("optimization done with %s..." % optimizer)
    elif optimizer == 'l-bfgs-b':
        # NOTE(review): fmin_l_bfgs_b names its gradient tolerance 'pgtol';
        # a forwarded 'gtol' is likely to be rejected.  Confirm intent.
        params = select("gtol", "epsilon", "maxiter", 'bounds')
        print("starting optimization with %s... %s" % (optimizer, params))
        if 'bounds' in params:
            # One (min, max) pair is replicated for every weight.
            params['bounds'] = (params['bounds'],) * len(self.wt)
        wt, f_opt, d = fmin_l_bfgs_b(neg_f, self.wt, fprime=neg_grad, **params)
        print("optimization done with %s..." % optimizer)
        print("f-opt: %.16f\n" % (-f_opt))
    else:
        raise Exception("Unknown optimizer '%s'" % optimizer)
    return wt
def fit(self, X, y):
    """Fit the parameter vector ``self.theta`` to ``(X, y)``.

    A leading column of ones is added to ``X``.  With
    ``self.solver == 'newton-cg'`` the cost is minimised by ``fmin_ncg``
    from zeros; for any other solver ``theta`` stays at zeros.  The cost at
    the final ``theta`` is stored in ``self.cost``.
    """
    ones = np.ones(X.shape[0])
    X_c = np.column_stack((ones, X))
    self.theta = np.zeros(X.shape[1] + 1)

    if self.solver == 'newton-cg':
        self.theta = fmin_ncg(
            self.costFunction,
            self.theta,
            fprime=self.gradient,
            args=(X_c, y),
            maxiter=self.max_iter,
            avextol=self.tol,
            disp=self.verbose,
        )

    self.cost = self.costFunction(self.theta, X_c, y)
def trainReg(X, y, lamd, lin=True):
    """Train a regularised regression model with Newton-CG.

    If ``lin`` is True, linear regression is trained on the dataset
    ``X``, ``y`` with regularization parameter ``lamd``; otherwise logistic
    regression is trained.

    The parameter vector starts at zeros, with two entries when ``X`` is a
    pandas Series and ``X.shape[1] + 1`` entries otherwise.  The fitted
    vector is printed and returned as a numpy array.
    """
    n_params = 2 if type(X) is pd.core.series.Series else X.shape[1] + 1
    init_theta = np.zeros(n_params)

    J, grad = for_opt_wrapper(regCostFunction, X, y, lamd, lin)
    opt_theta = fmin_ncg(J, init_theta, grad, maxiter=1000)
    print(opt_theta)
    return opt_theta
def map(self, tol=1e-8):
    """Compute the maximum a posteriori regression coefficients.

    The negated log posterior is minimised with Newton-CG, starting from a
    zero vector with one entry per column of ``self.X``.  ``tol`` is passed
    to the optimiser as ``avextol``.  The result is stored under
    ``self.cache['map']`` and returned.
    """
    def cost(w):
        return -self.log_posterior(w)

    def grad(w):
        return -self.log_posterior_grad(w)

    def hess(w):
        return -self.log_posterior_hess(w)

    n_coef = self.X.shape[1]
    w_map = fmin_ncg(cost, np.zeros(n_coef), grad, fhess=hess,
                     avextol=tol, disp=False)
    self.cache['map'] = w_map
    return w_map
def test():
    """Run Newton-CG on the module's objective from an all-ones start.

    The objective, gradient and Hessian are the module-level ``objective``,
    ``gradient`` and ``hessian`` evaluated with ``Qr = np.r_[1:3]``.  The
    objective value is printed after every iteration.

    Returns:
        ``exp(1j * phi)`` for the optimised vector ``phi`` of length 10.
    """
    N = 10
    Qr = np.r_[1:3]
    init = np.ones(N)

    def J(phi):
        return objective(phi, Qr)

    def dJ(phi):
        return gradient(phi, Qr)

    def d2J(phi):
        return hessian(phi, Qr)

    def report(phi):
        print(J(phi))

    phi_opt = fmin_ncg(J, init, fprime=dJ, fhess=d2J, avextol=1e-16,
                       callback=report)
    return np.exp(1j * phi_opt)
def laplace_approximation(self):
    """Find the mode of ``self.fcost`` over the latent variables ``f_hat``.

    After ``self.update()``, Newton-CG is run from the current ``f_hat``
    using the gradient and Hessian of the cost.  If that raises
    ``ValueError``, a message is printed and nonlinear conjugate gradient
    is used instead.  ``self.f_hat`` is stored with shape ``(self.N, 1)``.
    """
    self.update()
    try:
        mode = fmin_ncg(self.fcost, self.f_hat.copy().flatten(),
                        fprime=self.fcost_grad, fhess=self.fcost_hessian)
    except ValueError:
        print('ncg method barfed')
        mode = fmin_cg(self.fcost, self.f_hat.copy().flatten(),
                       fprime=self.fcost_grad)
    self.f_hat = mode
    self.f_hat = self.f_hat.reshape(self.N, 1)
def test_ncg(self): """ line-search Newton conjugate gradient optimization routine """ retval = optimize.fmin_ncg(self.func, self.startparams, self.grad, args=(), maxiter=self.maxiter, full_output=False, disp=False, retall=False) params = retval err = abs(self.func(params) - self.func(self.solution)) #print "NCG: Difference is: " + str(err) assert err < 1e-6
def get_inverse_hvp_cg(self, v, max_iterations=10, grad=None):
    """Run Newton-CG on the objective built from ``v``, starting at ``v``.

    NOTE(review): the name suggests the result approximates an inverse
    Hessian-vector product; this depends on the loss/gradient factories,
    which are not visible here.

    Args:
        v: the vector; a torch tensor is detached and converted to numpy.
        max_iterations: iteration cap for the optimiser.
        grad: forwarded to ``self.initialize``.

    Returns:
        The optimum returned by ``fmin_ncg``.
    """
    if isinstance(v, torch.Tensor):
        v = v.detach().numpy()
    self.initialize(self.X_train, self.Y_train, grad=grad)

    loss_fn = self.get_fmin_loss_fn(v)
    grad_fn = self.get_fmin_grad_fn(v)
    on_iteration = self.get_cg_callback(v)

    return fmin_ncg(
        f=loss_fn,
        x0=v,
        fprime=grad_fn,
        fhess_p=self.get_fmin_hvp,
        callback=on_iteration,
        avextol=1e-8,
        maxiter=max_iterations,
    )
def test_ncg_hessp(self, use_wrapper=False):
    """ Newton conjugate gradient with Hessian times a vector p """
    if not use_wrapper:
        params = optimize.fmin_ncg(self.func, self.startparams, self.grad,
                                   fhess_p=self.hessp, args=(),
                                   maxiter=self.maxiter,
                                   full_output=False, disp=False,
                                   retall=False)
    else:
        opts = {'maxiter': self.maxiter, 'disp': False,
                'return_all': False}
        result = optimize.minimize(self.func, self.startparams,
                                   method='Newton-CG', jac=self.grad,
                                   hessp=self.hessp, args=(),
                                   options=opts)
        params = result['x']

    assert_allclose(self.func(params), self.func(self.solution),
                    atol=1e-6)

    # Call counts must stay at the 'known good' values recorded with
    # Scipy 0.7.0; they are not allowed to increase.
    assert_(self.funccalls == 7, self.funccalls)
    assert_(self.gradcalls <= 18, self.gradcalls)  # 0.9.0
    # assert_(self.gradcalls == 18, self.gradcalls)  # 0.8.0
    # assert_(self.gradcalls == 22, self.gradcalls)  # 0.7.0

    # The iterates themselves must match the Scipy 0.7.0 trace.
    expected_trace = [[-4.35700753e-07, -5.24869435e-01, 4.87527480e-01],
                      [-4.35700753e-07, -5.24869401e-01, 4.87527774e-01]]
    assert_allclose(self.trace[3:5], expected_trace,
                    atol=1e-6, rtol=1e-7)
def get_inverse_hvp_cg(self, v, verbose):
    """Run Newton-CG on the objective built from ``v``.

    ``v`` is a sequence of arrays; it is concatenated into one flat
    starting vector and the optimiser result is split back by
    ``self.vec_to_list``.

    NOTE(review): the name suggests the result approximates an inverse
    Hessian-vector product; this depends on the loss/gradient factories,
    which are not visible here.

    Args:
        v: sequence of arrays.
        verbose: forwarded to ``self.get_cg_callback``.
    """
    loss_fn = self.get_fmin_loss_fn(v)
    grad_fn = self.get_fmin_grad_fn(v)
    on_iteration = self.get_cg_callback(v, verbose)

    flat_start = np.concatenate(v)
    solution = fmin_ncg(
        f=loss_fn,
        x0=flat_start,
        fprime=grad_fn,
        fhess_p=self.get_fmin_hvp,
        callback=on_iteration,
        avextol=self.avextol,
        maxiter=100,
    )
    return self.vec_to_list(solution)
def main():
    """Train the neural network on 'iris.data' and print its accuracy.

    The features are normalised with the training-set statistics, the
    weights are drawn from a uniform distribution, the regularised cost is
    minimised with Newton-CG, and the training and test accuracies are
    printed.
    """
    # data: load, then normalise both sets with the training statistics
    X_train, X_test, y_train, targets_train, targets_test = loadData('iris.data')
    X_train, mu, std = featureNormalize(X_train)
    X_test = (X_test - mu) / std

    # network layout and regularization parameter
    input_layer_size, hidden_layer_size, output_layer_size = 4, 3, 3
    Lambda = 0.1

    def uniform_weights(fan_in, fan_out):
        # Uniform in [-eps, eps] with eps = sqrt(6 / (fan_in + fan_out));
        # the extra column is for the bias unit.
        eps = np.sqrt(6.0/(fan_in+fan_out))
        return np.random.uniform(-1*eps, eps, size=(fan_out, fan_in+1))

    initial_theta1 = uniform_weights(input_layer_size, hidden_layer_size)
    initial_theta2 = uniform_weights(hidden_layer_size, output_layer_size)

    # the optimiser works on one flat vector holding both weight matrices
    initial_params = np.r_[initial_theta1.ravel(), initial_theta2.ravel()]
    args = (X_train, y_train, input_layer_size, hidden_layer_size, output_layer_size, Lambda)
    theta = optimize.fmin_ncg(computeCostReg, initial_params, fprime=computeGradientReg, args=args)

    # split the flat optimum back into the two weight matrices
    split = hidden_layer_size*(input_layer_size+1)
    theta1 = theta[:split].reshape(hidden_layer_size, input_layer_size+1)
    theta2 = theta[split:].reshape(output_layer_size, hidden_layer_size+1)

    prediction_train = predict(theta1, theta2, X_train)
    prediction_test = predict(theta1, theta2, X_test)

    training_acc = np.mean(targets_train == prediction_train) * 100
    test_acc = np.mean(targets_test == prediction_test) * 100

    print('The training set accuracy of the neural network: %.2f%%' % training_acc)
    print('The test set accuracy of the neural network: %.2f%%' % test_acc)
def trainLinearReg(X, y, lambd=0):
    """Train linear regression on ``(X, y)`` with regularization ``lambd``.

    Newton-CG (at most 200 iterations) is started from a zero vector with
    one more entry than ``X`` has columns.

    Returns:
        The trained parameters theta.
    """
    # Unpacking both dimensions keeps the requirement that X is 2-D.
    _, n_cols = X.shape
    theta_start = np.zeros((n_cols + 1, ), dtype=float)
    return op.fmin_ncg(f=linearRegCostFunction, x0=theta_start,
                       fprime=linearRegGradient, maxiter=200,
                       args=(X, y, lambd))
def conjugate_gradient(ax_fn, b, debug_callback=None, avextol=None, maxiter=None):
    """Computes the solution to Ax - b = 0 by minimizing the conjugate
    objective f(x) = x^T A x / 2 - b^T x. This does not require evaluating
    the matrix A explicitly, only the matrix vector product Ax.

    From https://github.com/kohpangwei/group-influence-release/blob/master/influence/conjugate.py.

    Args:
        ax_fn: A function that returns Ax given x.
        b: The vector b.
        debug_callback: An optional debugging function that reports the
            current optimization state. It is called with three arguments:
            the current solution, the linear part ``-b^T x`` and the
            quadratic part ``x^T A x / 2`` of the conjugate objective.
            (Default value = None)
        avextol: Convergence tolerance forwarded to ``fmin_ncg``. ``None``
            leaves scipy's own default in place. (Default value = None)
        maxiter: Iteration cap forwarded to ``fmin_ncg``.
            (Default value = None)

    Returns:
        The conjugate optimization solution.
    """
    cg_callback = None
    if debug_callback:
        cg_callback = lambda x: debug_callback(
            x, -np.dot(b, x), 0.5 * np.dot(x, ax_fn(x)))

    # fmin_ncg scales avextol by len(x0), so forwarding None raises a
    # TypeError. Only pass the tolerance when the caller supplied one.
    tolerance = {} if avextol is None else {"avextol": avextol}

    result = fmin_ncg(
        f=lambda x: 0.5 * np.dot(x, ax_fn(x)) - np.dot(b, x),
        x0=np.zeros_like(b),
        fprime=lambda x: ax_fn(x) - b,
        fhess_p=lambda x, p: ax_fn(p),
        callback=cg_callback,
        maxiter=maxiter,
        **tolerance
    )
    return result
def argmin_f(f, rho, A, AT, u, c):
    """Minimise ``-sum(f(x)) + u.r + rho/2 * sum(r**2)`` with Newton-CG.

    Here ``r[i] = safedot(A[i], x) - c[i]``.  The search starts from a
    vector of ones with one entry per column of ``A``.

    NOTE(review): the gradient of the ``-sum(f(x))`` term is hard-coded as
    ``-1/max(x, 1e-4)`` plus a penalty term of magnitude 100 that is
    non-zero below 1e-4; it does not call ``f``.  That presumably matches
    ``f = log``; confirm with the callers.

    Args:
        f: function applied to ``x``; its output is summed.
        rho: weight of the quadratic term.
        A: matrix, indexed by row.
        AT: the transpose of ``A``, indexed by row.
        u: vector with one entry per row of ``A``.
        c: vector with one entry per row of ``A``.

    Returns:
        The optimum returned by ``fmin_ncg``.
    """
    M, N = A.shape

    def sq(_x):
        return np.array([safedot(A[i], _x) - c[i] for i in range(M)])

    def pr(_sqx):
        return np.dot(u, _sqx) + rho / 2 * sum(_sqx**2)

    def fs(_x):
        return sum(f(_x))

    def obj(_x):
        return -fs(_x) + pr(sq(_x))

    # Gradient of u.r with respect to x; it does not depend on x.
    uTA = [np.dot(AT[i], u) for i in range(N)]

    def pprime(_x):
        return np.array(uTA)

    def ppprime(_sqx):
        return np.array([np.dot(AT[j], _sqx) for j in range(N)])

    def fprime(_x):
        return -1/np.maximum(_x, 1e-4) + 1e2 * np.sign(np.minimum(_x - 1e-4, 0))

    def jac(_x):
        return fprime(_x) + pprime(_x) + rho * ppprime(sq(_x))

    x = np.ones(N)
    xs = fmin_ncg(obj, x, fprime=jac, disp=0)
    return xs
def __solver__(self, p):
    """Minimize problem ``p`` with SciPy's Newton-CG and store the result on it.

    Reads ``p.f``, ``p.df``, ``p.x0`` and ``p.maxIter``, and uses ``p.d2f`` as
    the Hessian only when ``p.userProvided.d2f`` is truthy. On return,
    ``p.xk``/``p.xf`` hold the solution, ``p.fk``/``p.ff`` its objective
    value, and ``p.istop`` is set to 1000.
    """
    def report_iteration(x):
        # Evaluate first, then publish the iterate and its value together.
        value = p.f(x)
        p.xk = x
        p.fk = value
        p.iterfcn()
        # The problem's own hook may request a stop; unwind the optimizer.
        if p.istop:
            raise isSolved

    hessian = p.d2f if p.userProvided.d2f else None

    xf = fmin_ncg(
        p.f,
        p.x0,
        p.df,
        fhess=hessian,
        maxiter=p.maxIter + 15,
        disp=0,
        callback=report_iteration,
    )
    ff = p.f(xf)

    p.istop = 1000
    p.xk = p.xf = xf
    p.fk = p.ff = ff
def one_vs_all(X, y, num_labels, lamb):
    """Train one regularized logistic-regression classifier per class.

    Row ``k`` of the result is fitted against the binary target
    ``y == k + 1``, so labels are taken to be 1-based.

    Args:
        X: Matrix of features (a column of ones is prepended here).
        y: Vector of labels.
        num_labels: Number of classes.
        lamb: Regularization parameter.

    Returns:
        all_theta: Array of shape (num_labels, num_features + 1) holding one
            fitted parameter row per class.

    Raises:
        An error occurs if the number of labels is 0.
    """
    if num_labels == 0:
        raise Error('num_labels = 0')

    num_train_ex, num_features = X.shape[0], X.shape[1]
    num_params = num_features + 1

    all_theta = numpy.zeros((num_labels, num_params))
    aug_x = numpy.c_[numpy.ones((num_train_ex, 1)), X]

    for label_index in range(num_labels):
        # Binary target for this class, compared against a column of k + 1.
        class_column = (label_index + 1) * numpy.ones((num_train_ex, 1))
        y_arg = numpy.equal(y, class_column).astype(int)

        solver_out = fmin_ncg(
            compute_cost,
            numpy.zeros(num_params),
            fprime=compute_gradient,
            args=(aug_x, y_arg, num_train_ex, lamb),
            avextol=1e-10,
            epsilon=1e-10,
            maxiter=400,
            full_output=1,
        )
        # With full_output the first element is the optimal parameters.
        all_theta[label_index, :] = numpy.reshape(
            solver_out[0], (1, num_params), order='F')

    return all_theta
def train_with_fmin(self, train_feed_dict, save_checkpoints=True, verbose=True):
    """Fit the model parameters with Newton-CG and load them into the session.

    Args:
        train_feed_dict: Feed dict used to build the loss, gradient and
            Hessian-vector-product callables.
        save_checkpoints: When true, save the session through ``self.saver``
            with ``global_step=0`` after the parameters are set.
        verbose: When true, print a header and return the value of
            ``self.print_model_eval()``.

    Returns:
        The result of ``self.print_model_eval()`` if ``verbose``, else None.
    """
    loss_fn = self.get_train_fmin_loss_fn(train_feed_dict)
    grad_fn = self.get_train_fmin_grad_fn(train_feed_dict)
    hvp_fn = self.get_train_fmin_hvp_fn(train_feed_dict)

    # Start from the first parameter tensor currently held by the session.
    start = np.array(self.sess.run(self.params)[0])

    solution = fmin_ncg(
        f=loss_fn,
        x0=start,
        fprime=grad_fn,
        fhess_p=hvp_fn,
        avextol=1e-8,
        maxiter=100)

    # Push the flattened optimum back into the graph.
    self.sess.run(
        self.set_params_op,
        feed_dict={self.W_placeholder: np.reshape(solution, -1)})

    if save_checkpoints:
        self.saver.save(self.sess, self.checkpoint_file, global_step=0)

    if not verbose:
        return None

    print('After training with CG: ')
    return self.print_model_eval()
def test_newton_cg():
    """Check that newton_cg gives the same result as scipy's fmin_ncg.

    The objective is the least-squares function ``0.5 * ||A x||^2`` for a
    fixed random 10x10 matrix ``A``, started from a vector of ones.
    """
    rng = np.random.RandomState(0)
    A = rng.normal(size=(10, 10))
    x0 = np.ones(10)

    def func(x):
        Ax = A.dot(x)
        return .5 * (Ax).dot(Ax)

    def grad(x):
        return A.T.dot(A.dot(x))

    def hess(x, p):
        # Hessian-vector product for fmin_ncg's ``fhess_p``: the Hessian of
        # func is the constant matrix A^T A, so the product is A^T (A p).
        # The previous form went through ``x.all()`` and was only correct
        # while every entry of x happened to be non-zero.
        return A.T.dot(A.dot(p))

    def func_grad_hess(x):
        return func(x), grad(x), lambda x: A.T.dot(A.dot(x))

    assert_array_almost_equal(
        newton_cg(func_grad_hess, func, grad, x0, tol=1e-10),
        fmin_ncg(f=func, x0=x0, fprime=grad, fhess_p=hess))
def train_log_reg(X, y):
    """Solve for regularized logistic regression weights with Newton-CG.

    A column of ones is prepended to ``X`` and the regularization parameter
    is fixed at 1.

    Args:
        X: Matrix of features.
        y: Vector of labels.

    Returns:
        theta: Column vector of shape (num_features + 1, 1) with the fitted
            parameters.
    """
    num_train_ex, num_features = X.shape[0], X.shape[1]
    num_params = num_features + 1

    X_aug = numpy.c_[numpy.ones((num_train_ex, 1)), X]
    y_vec = numpy.reshape(y, (num_train_ex, 1))

    # Regularization strength is hard-coded.
    lamb = 1

    solver_out = fmin_ncg(
        compute_cost_reg,
        numpy.zeros(num_params),
        fprime=compute_gradient_reg,
        args=(X_aug, y_vec, num_train_ex, lamb),
        avextol=1e-7,
        epsilon=1e-7,
        maxiter=400,
        full_output=1,
        disp=0,
    )

    # With full_output the optimum is the first element; return it as a column.
    return numpy.reshape(solver_out[0], (num_params, 1), order='F')
def get_inverse_hvp_cg(self, v, tol=1e-5, max_iter=1000):
    """Minimize ``0.5 * x^T H x - v^T x`` with Newton-CG.

    ``H`` is only accessed through ``self.eval_hvp``, whose result must be
    2-D because its shape is unpacked into two dimensions. The search starts
    from ``np.concatenate(v)`` and the objective value is printed after
    every iteration.

    :param v: right-hand side; subtracted from the 2-D output of ``eval_hvp``
    :param tol: passed to ``fmin_ncg`` as ``avextol``
    :param max_iter: passed to ``fmin_ncg`` as ``maxiter``
    :return: the minimizer returned by ``fmin_ncg``
    """
    def objective(x):
        quadratic_part = np.multiply(0.5, x.T.dot(self.eval_hvp(x)))
        return quadratic_part - v.T.dot(x)

    def gradient(x):
        hx = self.eval_hvp(x)
        n_rows, n_cols = hx.shape
        return (hx - v).reshape((n_rows * n_cols,))

    def hessian_vector_product(x, p):
        hp = self.eval_hvp(p)
        n_rows, n_cols = hp.shape
        return hp.reshape((n_rows * n_cols,))

    def report(x):
        print('CG Objective: %s' % objective(x)[0])

    return fmin_ncg(
        f=objective,
        x0=np.concatenate(v),
        fprime=gradient,
        fhess_p=hessian_vector_product,
        callback=report,
        avextol=tol,
        maxiter=max_iter)
def part_three_linear(X, y, theta, print_output=True):
    """Optimize theta with Newton-CG, report it, and plot the decision boundary.

    Args:
        X: Feature matrix passed to the cost and gradient functions.
        y: Labels passed to the cost and gradient functions.
        theta: Starting parameter vector.
        print_output: When true, print the final cost and theta next to the
            expected reference values.

    Returns:
        The optimized theta.
    """
    # full_output makes fmin_ncg return a 6-tuple; only the first two
    # entries (optimum and cost) are used.
    theta, cost, _nf, _gf, _hf, _w = optimize.fmin_ncg(
        f=compute_cost,
        x0=theta,
        fprime=compute_grad,
        maxiter=400,
        args=(X, y),
        full_output=True,
    )

    # Print theta to screen
    if print_output:
        print('Cost at theta found by fminunc: {:f}'.format(cost))
        print('Expected cost (approx): 0.203')
        print('theta:')
        print(' {}'.format(theta))
        print('Expected theta (approx):\n -25.161\n 0.206\n 0.201\n')

    # Plot Boundary
    class_labels = ['Admitted', 'Not Admitted', 'Decision Boundary']
    plot_decision_boundary(theta, X, y, labels=class_labels)

    # Put some labels
    plt.xlabel('Exam 1 Score', size=18)
    plt.ylabel('Exam 2 Score', size=18)
    plt.gca().tick_params(labelsize=14)
    plt.show()

    return theta
def _fit_ncg(self, X, y, X_val, Y_val, activations, deltas, coef_grads,
             intercept_grads, layer_units):
    """Fit the network with SciPy's Newton-CG on the packed parameter vector.

    First records, for every layer transition, where its coefficient block
    and its intercept block live inside the packed vector (all coefficient
    blocks come first, then all intercept blocks). The packed parameters are
    then optimized for at most 200 iterations, the final loss is stored in
    ``self.loss_`` and the optimum is unpacked back into the model.

    ``X_val`` and ``Y_val`` are accepted but not used in this method.
    """
    # Store meta information for the parameters
    self._coef_indptr = []
    self._intercept_indptr = []
    offset = 0

    # Coefficient blocks: (start, end, (fan_in, fan_out)) per transition.
    for layer in range(self.n_layers_ - 1):
        block_shape = (layer_units[layer], layer_units[layer + 1])
        stop = offset + (block_shape[0] * block_shape[1])
        self._coef_indptr.append((offset, stop, block_shape))
        offset = stop

    # Intercept blocks: (start, end) per transition, after all coefficients.
    for layer in range(self.n_layers_ - 1):
        stop = offset + layer_units[layer + 1]
        self._intercept_indptr.append((offset, stop))
        offset = stop

    # Run Newton-CG
    packed_start = _pack(self.coefs_, self.intercepts_)
    loss_args = (X, y, activations, deltas, coef_grads, intercept_grads)

    (optimal_parameters, self.loss_,
     _func_calls, _grad_calls, _hess_calls, _warnflag) = optimize.fmin_ncg(
        x0=packed_start,
        f=self._loss_func,
        fprime=self._grad_func,
        maxiter=200,
        disp=True,
        args=loss_args,
        callback=self._callback,
        full_output=True)

    self._unpack(optimal_parameters)
def func(self, thetas_p, max_iter, n, c, X_p, y_p, C):
    """Fit the parameter vector for slot ``c`` and store it in ``thetas_p[c]``.

    The optimizer is chosen by ``self.solver`` and minimizes
    ``self.cost_func`` with gradient ``self.grad_cost_func``, both called
    with the extra arguments ``(X_p[c], y_p[c], C)``.

    Args:
        thetas_p: Indexable container that receives the result at index ``c``.
        max_iter: Not used; the iteration cap is read from ``self.max_iter``.
            NOTE(review): confirm which of the two is intended.
        n: Number of features; the start point has ``n + 1`` zeros.
        c: Index selecting the data in ``X_p``/``y_p`` and the output slot.
        X_p: Indexable collection of design matrices.
        y_p: Indexable collection of targets.
        C: Extra parameter forwarded to the cost and gradient functions.

    Raises:
        ValueError: If ``self.solver`` is not one of ``'fmincg'``,
            ``'newton-cg'`` or ``'lbfgs'``.
    """
    # The SciPy routines flatten x0 anyway; start from a 1-D vector directly.
    initial_theta = np.zeros(n + 1, dtype=np.float64)
    # A tuple, because SciPy documents ``args`` as a tuple and older wrappers
    # concatenate it with one.
    args = (X_p[c], y_p[c], C)
    print('Iter: ', c)

    common = dict(fprime=self.grad_cost_func, args=args,
                  maxiter=self.max_iter)
    if self.solver == 'fmincg':
        theta = optimize.fmin_cg(self.cost_func, initial_theta, **common)
    elif self.solver == 'newton-cg':
        theta = optimize.fmin_ncg(self.cost_func, initial_theta, **common)
    elif self.solver == 'lbfgs':
        # fmin_l_bfgs_b returns (x, f, info); only the optimum is needed.
        theta = optimize.fmin_l_bfgs_b(self.cost_func, initial_theta,
                                       **common)[0]
    else:
        # Previously an unknown solver left ``theta`` unbound and failed
        # below with an UnboundLocalError.
        raise ValueError('unknown solver: %r' % (self.solver,))

    thetas_p[c] = theta.transpose()
def fit(self, X, y, n, m):
    """Fit a single parameter vector and store it in ``self.all_theta``.

    A column of ones is prepended to ``X``. The optimizer named by
    ``self.solver`` then minimizes ``self.cost_func`` (gradient
    ``self.grad_cost_func``) starting from zeros. If ``self.solver`` matches
    none of the known names, ``self.all_theta`` is left as zeros.

    Args:
        X: Feature matrix with ``m`` rows and ``n`` columns.
        y: Targets; converted with ``np.array``.
        n: Number of features.
        m: Number of samples.
    """
    bias_column = np.ones((m, 1), dtype=np.float64)
    X_aux = np.concatenate((bias_column, X), axis=1)
    initial_theta = np.zeros((n + 1, 1), dtype=np.float64)
    y_enc = np.array(y)
    args = [X_aux, y_enc, self.C]

    # Placeholder value in case the optimizer raises.
    self.all_theta = np.zeros((1, n + 1), dtype=np.float64)

    solver = self.solver
    if solver == 'fmincg':
        theta = optimize.fmin_cg(
            self.cost_func, initial_theta, fprime=self.grad_cost_func,
            args=args, maxiter=self.max_iter)
    elif solver == 'newton-cg':
        theta = optimize.fmin_ncg(
            self.cost_func, initial_theta, fprime=self.grad_cost_func,
            args=args, maxiter=self.max_iter)
    elif solver == 'lbfgs':
        # fmin_l_bfgs_b returns a tuple; the optimum is its first element.
        theta = optimize.fmin_l_bfgs_b(
            self.cost_func, initial_theta, fprime=self.grad_cost_func,
            args=args, maxiter=self.max_iter)[0]
    else:
        # Unknown solver: fall back to an all-zero column vector.
        theta = np.zeros((n + 1, 1), dtype=np.float64)

    self.all_theta = theta.transpose()
def minimize(func, x0, gradient=None, hessian=None, algorithm="default",
             verbose=False, **args):
    r"""
    This function is an interface to a variety of algorithms for computing
    the minimum of a function of several variables.

    INPUT:

    - ``func`` -- Either a symbolic function or a Python function whose
      argument is a tuple with `n` components

    - ``x0`` -- Initial point for finding minimum.

    - ``gradient`` -- Optional gradient function. This will be computed
      automatically for symbolic functions. For Python functions, it allows
      the use of algorithms requiring derivatives. It should accept a
      tuple of arguments and return a NumPy array containing the partial
      derivatives at that point.

    - ``hessian`` -- Optional hessian function. This will be computed
      automatically for symbolic functions. For Python functions, it allows
      the use of algorithms requiring derivatives. It should accept a tuple
      of arguments and return a NumPy array containing the second partial
      derivatives of the function.

    - ``algorithm`` -- String specifying algorithm to use. Options are
      ``'default'`` (for Python functions, the simplex method is the default)
      (for symbolic functions bfgs is the default):

      - ``'simplex'`` -- using the downhill simplex algorithm

      - ``'powell'`` -- use the modified Powell algorithm

      - ``'bfgs'`` -- (Broyden-Fletcher-Goldfarb-Shanno) requires gradient

      - ``'cg'`` -- (conjugate-gradient) requires gradient

      - ``'ncg'`` -- (newton-conjugate gradient) requires gradient and hessian

      Any other value raises a ``ValueError``.

    - ``verbose`` -- (optional, default: False) print convergence message

    .. NOTE::

        For additional information on the algorithms implemented in this
        function, consult SciPy's `documentation on optimization and root
        finding <https://docs.scipy.org/doc/scipy/reference/optimize.html>`_

    EXAMPLES:

    Minimize a fourth order polynomial in three variables (see the
    :wikipedia:`Rosenbrock_function`)::

        sage: vars = var('x y z')
        sage: f = 100*(y-x^2)^2+(1-x)^2+100*(z-y^2)^2+(1-y)^2
        sage: minimize(f, [.1,.3,.4]) # abs tol 1e-6
        (1.0, 1.0, 1.0)

    Try the newton-conjugate gradient method; the gradient and hessian are
    computed automatically::

        sage: minimize(f, [.1, .3, .4], algorithm="ncg") # abs tol 1e-6
        (1.0, 1.0, 1.0)

    We get additional convergence information with the `verbose` option::

        sage: minimize(f, [.1, .3, .4], algorithm="ncg", verbose=True)
        Optimization terminated successfully.
        ...
        (0.9999999..., 0.999999..., 0.999999...)

    Same example with just Python functions::

        sage: def rosen(x): # The Rosenbrock function
        ....:    return sum(100.0r*(x[1r:]-x[:-1r]**2.0r)**2.0r + (1r-x[:-1r])**2.0r)
        sage: minimize(rosen, [.1,.3,.4]) # abs tol 3e-5
        (1.0, 1.0, 1.0)

    Same example with a pure Python function and a Python function to
    compute the gradient::

        sage: def rosen(x): # The Rosenbrock function
        ....:    return sum(100.0r*(x[1r:]-x[:-1r]**2.0r)**2.0r + (1r-x[:-1r])**2.0r)
        sage: import numpy
        sage: from numpy import zeros
        sage: def rosen_der(x):
        ....:    xm = x[1r:-1r]
        ....:    xm_m1 = x[:-2r]
        ....:    xm_p1 = x[2r:]
        ....:    der = zeros(x.shape, dtype=float)
        ....:    der[1r:-1r] = 200r*(xm-xm_m1**2r) - 400r*(xm_p1 - xm**2r)*xm - 2r*(1r-xm)
        ....:    der[0] = -400r*x[0r]*(x[1r]-x[0r]**2r) - 2r*(1r-x[0])
        ....:    der[-1] = 200r*(x[-1r]-x[-2r]**2r)
        ....:    return der
        sage: minimize(rosen, [.1,.3,.4], gradient=rosen_der, algorithm="bfgs") # abs tol 1e-6
        (1.0, 1.0, 1.0)
    """
    from sage.symbolic.expression import Expression
    from sage.ext.fast_eval import fast_callable
    import numpy
    from scipy import optimize

    symbolic = isinstance(func, Expression)
    if symbolic:
        # Compile the expression and its gradient into fast float callables.
        var_list = func.variables()
        var_names = [str(_) for _ in var_list]
        fast_f = fast_callable(func, vars=var_names, domain=float)
        f = lambda p: fast_f(*p)
        gradient_list = func.gradient()
        fast_gradient_functions = [
            fast_callable(gradient_list[i], vars=var_names, domain=float)
            for i in range(len(gradient_list))
        ]
        gradient = lambda p: numpy.array(
            [a(*p) for a in fast_gradient_functions])
    else:
        f = func

    start = [float(_) for _ in x0]

    # 'default' resolves to bfgs when a gradient is available, else simplex.
    if algorithm == "default":
        algorithm = "simplex" if gradient is None else "bfgs"

    if algorithm == "simplex":
        result = optimize.fmin(f, start, disp=verbose, **args)
    elif algorithm == "bfgs":
        result = optimize.fmin_bfgs(f, start, fprime=gradient,
                                    disp=verbose, **args)
    elif algorithm == "cg":
        result = optimize.fmin_cg(f, start, fprime=gradient,
                                  disp=verbose, **args)
    elif algorithm == "powell":
        result = optimize.fmin_powell(f, start, disp=verbose, **args)
    elif algorithm == "ncg":
        if symbolic:
            hess = func.hessian()
            hess_fast = [[
                fast_callable(a, vars=var_names, domain=float) for a in row
            ] for row in hess]
            hessian = lambda p: [[a(*p) for a in row] for row in hess_fast]
        if hessian is None:
            # No Hessian available: let SciPy approximate it from the gradient.
            hessian_p = None
        else:
            # numpy.dot replaces the former ``scipy.dot``: the name ``scipy``
            # is never bound in this function.
            hessian_p = lambda p, v: numpy.dot(numpy.array(hessian(p)), v)
        result = optimize.fmin_ncg(f, start, fprime=gradient,
                                   fhess=hessian, fhess_p=hessian_p,
                                   disp=verbose, **args)
    else:
        # Previously an unknown name fell through and returned the builtin
        # ``min`` wrapped in a vector.
        raise ValueError("unknown algorithm %r" % (algorithm,))

    return vector(RDF, result)
def find_ML_Estimator(image, fitParams, outputHandle=None, setParams=None, modelLookup=None, searchMethod='simplex', preSearchMethod=None, Prior=None, bruteRange=None, biasCorrect=0, calcNoise=None, bcoutputHandle=None, error='Fisher', **iParams): import scipy.optimize as opt import model_Production as modPro from surface_Brightness_Profiles import gaussian_SBProfile_CXX import measure_Bias as mBias from generalManipulation import makeIterableList """ MAIN ROUTINE FOR THIS MODULE. Takes in an image (at minimum) and a set of values which defines the model parameters (fit and those which are free to vary), and returns the parameter values at which the log-Likelihood is minimised (or Likelihood is maximised). Can correct for first order noise bias (if biasCorrect != 0), and an estimate of the error (if error is equal to a set of pre-defined values [see below]). Requires: -- image: 2d array of pixelised image -- fitParams: tuple of strings which define the model parameters which are free to vary (those which will be fit). These must satisfy the definition of model parameters as set out in the default model dictionary. If None, then e1, e2 and T are fit (this could be made stricter by removing the default None initialisation, thereby requiring that a set of parameters to be fit is passed in). -- outputHandle: handle of the output file. **Result is always appended**. If not passed in, then result is not output. Output is in ASCII form. -- setParams: Default model dictionary containing fixed parameters which describes the model being fixed. One part of a two part approach to setting the full model parameter dictionary, along with iParams. If None, then default model dictionary is taken. -- modelLookup: Dictionary containing lookup table for pixelised model images, as defined in model_Production module. If None, no lookup is used, and the model is re-evalauted for each change in model parameters. -- searchMethod: String detailing which form of minimisation to use. 
Accepted values are: ___ simplex, brent, powell, cg, bfgs, l_bfgs_b, ncg (as defined in SciPy documentation) -- preSearchMethod: String detailing initial search over parameter space to find global Minimium, used as an initial guess for refinement with searchMethod. If None, initial guess is set to default passed in by the combination of setParams and iParams. If not None, then code will run an initial, coarse search over the parameter space to attempt to find the global mimima. By default this is switched off. Where preSearchMethod == grid or brute, the a grid based search is used. Where this is used, a range must either be entered by the user through bruteRange, or it is taken from the entered prior information. NOTE: This still uses a typically coarse grid, therefore if the range is too wide then it is possible that the code may still find a local mimimum if this exists within one grid point interval of the global miminum. -- Prior: NOT USED YET. Skeleton to allow for a parameter prior structure to be passed in -- bruteRange: [nPar, 2] sized tuple setting the range in which the initial preSearchMethod is evaluated, if this is done using a grid or brute method (both equivalent), where nPar is the number of free model parameters being fit. THIS DOES NOT CONSTITUTE A PRIOR, as the refinement may still find an ML value outside this range, however where the global maximum occurs outside this range the returned ML value may be expected to be biased. -- biasCorrect: integer, states what level of noise bias to correct the estimate to. Only 1st order correction (biasCorrect == 1) is supported. If biasCorrect == 0, the uncorrected estimate (and error if applicable) are output. If biasCorrect > 0, the uncorrected, corrected and error (if applicable) are output. When used, it is important that *the entered model parameter dictionary contains an accurate measure of the pixel noise of appropriate signal--to--noise, as the analytic bias scales according to both*. 
Noise can be estimate using estimate_Noise() before entry. -- bcOutputhandle: As outputHandle, except for the bias corrected estimator. -- error: String detailing error estiamte to output. Supported values are: ___ fisher: Marginalised fisher error for each parameter around the ML point. See docstring for fisher_Error_ML(). ___ brute: UNSUPPORTED, however an error defined on the parameter likelihood itself can be derived if the preSearchMethod and bruteRange is defined such that the Likelihood has *compact support*. If not, then this would be inaccurate (underestimated). Therefore coding for this is deferred until the application of a prior is developed, as use of a prior ensures compact support by default. -- iParams: set of optional arguments which, together with setParams, defines the intial model dictionary. Allows parameter values to be input individually on call, and is particularly useful for setting initial guesses where preSearchMethod == None. Model Parameter entry: Model Parameters can be entered using two methods ___ setParams: Full Dictionary of initial guess/fixed value for set of parameters. If None, this is set to default set. May not be complete: if not, then model parameters set to default as given in default_ModelParameter_Dictionary() ___iParams: generic input which allows model parameters to be set individually. Keys not set are set to default as given by default_ModelParameter_Dictionary(). Where an iParams key is included in the default dictionary, or setParams, it will be updated to this value (**therefore iParams values have preferrence**). If key not present in default is entered, it is ignored ___ The initial choice of model parameters (including intial guesses for the minimisation routine where preSearchMethod == False) is thus set as setParams+{iParams} Returns: Returned: tuple of length equal to fitParams. 
Gives ML estimator for each fit parameter, with bias corrected version (if biasCorrect != 0) and error (if applicable) aslways in that order. """ ''' Set up defaults ''' ##Initialise result variables Returned = [] err = None ## Exceptions based on input objects if (image is None or sum(image.shape) == 0): raise RuntimeError( 'find_ML_Estimator - image supplied is None or uninitialised') if (len(fitParams) > 2 and modelLookup is not None and modelLookup['useLookup']): raise RuntimeError( 'find_ML_Estimator - Model Lookup is not supported for more than double parameter fits' ) ##Set up initial params, which sets the intial guess or fixed value for the parameters which defines the model ##This line sets up the keywords that are accepted by the routine ## pixle_Scale and size should be in arsec/pixel and arcsec respectively. If pixel_scale = 1., then size can be interpreted as size in pixels ## centroid should be set to the center of the image, here assumed to be the middle pixel if (setParams is None): print "Setting parameters to default" initialParams = modPro.default_ModelParameter_Dictionary() else: print "Updating initial parameters with set Params" initialParams = modPro.default_ModelParameter_Dictionary() modPro.update_Dictionary(initialParams, setParams) ## Deprecated initialParams.update(setParams) modPro.set_modelParameter(initialParams, iParams.keys(), iParams.values()) ## Define modelParams modelParams = deepcopy(initialParams) ## Estimate Noise of Image if (calcNoise is not None): #Assumes each image is flattened and therefore needs to be reshaped. if (len(image.shape) == 2): if (image.shape[0] < 2): #Use only the first image tImage = image[0].reshape(modelParams['stamp_size']) maskCentroid = modelParams['centroid'] else: #Use an alternate stack of closest to even (assumes that pixel error is roughly symmetric), (the alternative stack should negate any feature and background, the effect on the noise is uncertain). 
Can only be used on multiple realisations of the same field if (image.shape[0] % 2 == 0): finalIndex = image.shape[0] else: finalIndex = image.shape[0] - 1 print "Final Index check (should be even): ", finalIndex aStackImage = np.zeros(image[0].shape) for i in range(finalIndex): aStackImage += image[i] #*np.power(-1, i) print "\nEstimating noise from stack-subtracted image" aStackImage /= float(finalIndex) tImage = (image[0] - aStackImage).reshape( modelParams['stamp_size']) #Turn off centroid masking (as feature should be removed), subtract stacked from each realisation, and flatten for noise estimation maskCentroid = None aStackImage = np.tile(aStackImage, (image.shape[0], 1)) tImage = (image - aStackImage).flatten() print "--Done" #-- Note, this could be improved by removing maskCentroid in this case, thus allowing the flattened array to be used (a larger data vector), and thus reducing the noise on the error estimation ##Plot # import pylab as pl # f = pl.figure() # ax = f.add_subplot(111) # im = ax.imshow(tImage) # pl.colorbar(im) # pl.show() elif (len(image.shape) == 1): tImage = image.reshape(modelParams['stamp_size']) maskCentroid = modelParams['centroid'] else: raise ValueError( "find_ML_Estimate: calcNoise: image not of expected shape") modelParams['noise'] = calcNoise(tImage, maskCentroid) ####### Search lnL for minimum #Construct initial guess for free parameters by removing them from dictionary x0 = modPro.unpack_Dictionary(modelParams, requested_keys=fitParams) ###### Sanity check image dimensions compared to model parameters imDim = len(image.shape) if (imDim > 2): raise ValueError( "find_ML_Estimator: Image must not have more than two dimensions. 
Single postage stamp image must be flattened" ) elif (imDim == 1 and image.shape[0] != np.array(modelParams['stamp_size']).prod()): raise ValueError( "find_ML_Estimator: Flattened image (1D) length does not correspond to model parameter dimensions" ) elif (imDim == 2 and image.shape[1] != np.array(modelParams['stamp_size']).prod()): print 'Image shape: ', image.shape, ' Model shape:', modelParams[ 'stamp_size'] raise ValueError( "find_ML_Estimator: image sahpe of second dimension is not consistent with expected model parameter dimension. 2D image array must contain multiple images across first dimension, and (flattened) pixels as a data vector in the second dimension: Have you remembered to flatten the image?" ) if (preSearchMethod is not None): ## Conduct a presearch of the parameter space to set initial guess (usually grid-based or brute-force) if (vverbose or debug): print '\n Conducting a pre-search of parameter space to idenitfy global minima' if (preSearchMethod.lower() == 'grid' or preSearchMethod.lower() == 'brute'): ##Brute force method over a range either set as the prior, or the input range. if (bruteRange is not None): if (vverbose or debug): print '\n Using user-defined parameter range:', bruteRange print "Using bruteRange: ", bruteRange #x0, fval, bruteGrid, bruteVal bruteOut = opt.brute(get_logLikelihood, ranges=bruteRange, args=(fitParams, image, modelParams, modelLookup, 'sum'), finish=None, full_output=True) x0, fval, bruteGrid, bruteVal = bruteOut ## x0 has len(nParam); fval is scalar; bruteGrid has len(nParam), nGrid*nParam; bruteVal has nGrid*nParam ###Evaluate error based on brute by integration - this would only work if bruteRange cover the full range where the PDF is non-zero if (error is not None and error.lower() == 'brute'): raise RuntimeError( 'find_ML_Estimator - brute labelled as means of evaluating error. This is possbible, but not coded as limitation in use of bruteRange to cover the whole region where the likelihood is non-zero. 
When a prior is included, this could be taken to be exact, provided one knows the range where the prior has compact support, and the bruteRange reflects this.' ) ## use scipy.integrate.trapz(bruteVal, x = bruteGrid[i], axis = i) with i looping over all parameters (ensure axis set properly... ##Testing of error determination # tErr = fisher_Error_ML(x0, fitParams, image, modelParams, modelLookup) # from scipy.stats import norm # rv = norm(loc = x0, scale = tErr) # ##Plot this # import pylab as pl # f = pl.figure() # ax = f.add_subplot(111) # import math # ax.plot(bruteGrid, np.exp(-1.*(bruteVal-np.amin(bruteVal))), bruteGrid, (np.sqrt(2*math.pi)*tErr)*rv.pdf(bruteGrid)) # pl.show() # raw_input("Check") if (vverbose or debug): print '\n preSearch has found a minimum (on a coarse grid) of:', x0 elif (Prior is not None): if (vverbose or debug): print '\n Using prior range' raise RuntimeError( 'find_ML_Estimator - Prior entry has not yet been coded up' ) else: raise RuntimeError( 'find_ML_Estimator - Brute preSearch is active, but prior or range is not set' ) if (debug or vverbose): ##Output Model Dictionary and initial guess information print 'Model Dictionary:', modelParams print '\n Initial Guess:', x0 ##Find minimum chi^2 using scipy optimize routines ##version 11+ maxima = opt.minimize(get_logLikelihood, x0, args = (fitParams, image, modelParams)) if (searchMethod.lower() == 'simplex'): maxima = opt.fmin(get_logLikelihood, x0=x0, xtol=0.00001, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug)) elif (searchMethod.lower() == "emcee"): import emcee if (verbose): print "\n-Running emcee....." #Define MCMC parameters. These should be passed in nWalkers = 6 nRun = 1000 nBurn = 100 if (not isinstance(x0, np.ndarray)): x0 = np.array(x0) nDim = x0.shape[0] print "x0: ", x0 #Produce a new x0 for each parameter. For now, take as -1.5x0 to 1.5x0. 
Better to pass this in, or inform from prior range p0 = np.zeros((nWalkers, nDim)) for i in range(x0.shape[0]): p0[:, i] = np.random.uniform(-1.5 * x0[i], 1.5 * x0[i], nWalkers) print "P0:", p0 sampler = emcee.EnsembleSampler(nWalkers, nDim, get_logLikelihood, args=(fitParams, image, modelParams, modelLookup, 'sum', -1)) #Burn-in if (verbose): print "-Running burn-in....." pos, prob, state = sampler.run_mcmc(p0, nBurn) sampler.reset() if (verbose): print "--Finished burn-in." print " Position is ", pos print "with prob: ", prob #Run if (verbose): print "-Sampling....." sampler.run_mcmc(pos, nRun) if (verbose): print "--Finished", nRun, " samples." #Get output chain = sampler.flatchain pChain = sampler.flatlnprobability maxIndex = np.argmax(pChain, axis=0) maxima = chain[maxIndex, :] err = np.std(chain, axis=0) if (debug): import pylab as pl f = pl.figure() for i in range(1, nDim + 1): ax = f.add_subplot(nDim, 1, i) ax.hist(chain[:, i - 1], bins=100) ax.set_title("Par: " + fitParams[i - 1]) pl.show() elif (searchMethod.lower() == 'brent'): maxima = opt.fmin_brent(get_logLikelihood, x0=x0, xtol=0.00001, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug)) elif (searchMethod.lower() == 'powell'): maxima = opt.fmin_powell(get_logLikelihood, x0=x0, xtol=0.00001, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug)) elif (searchMethod.lower() == 'cg'): ##Not tested (10Aug) maxima = opt.fmin_cg( get_logLikelihood, x0=x0, fprime=differentiate_logLikelihood_Gaussian_Analytic, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug), ftol=0.000001) elif (searchMethod.lower() == 'bfgs'): ##Not tested (10Aug) maxima = opt.fmin_bfgs( get_logLikelihood, x0=x0, fprime=differentiate_logLikelihood_Gaussian_Analytic, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug)) elif (searchMethod.lower() == 'l_bfgs_b'): ##Not tested (10Aug) maxima = opt.fmin_l_bfgs_b( 
get_logLikelihood, x0=x0, fprime=differentiate_logLikelihood_Gaussian_Analytic, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug)) elif (searchMethod.lower() == 'ncg'): ##Not tested (10Aug) maxima = opt.fmin_ncg( get_logLikelihood, x0=x0, fprime=differentiate_logLikelihood_Gaussian_Analytic, args=(fitParams, image, modelParams, modelLookup, 'sum'), disp=(verbose or debug)) else: raise ValueError( 'find_ML_Estimator - searchMethod entered is not supported:' + str(searchMethod)) ##Make numpy array (in the case where 1D is used and scalar is returned): if (len(fitParams) == 1): maxima = np.array(makeIterableList(maxima)) if (vverbose): print 'maxima is:', maxima if (debug): ##Plot and output residual print 'Plotting residual..' fittedParams = deepcopy(modelParams) modPro.set_modelParameter(fittedParams, fitParams, maxima) ''' Deprecated for i in range(len(fitParams)): fittedParams[fitParams[i]] = maxima[i] ''' model, disc = modPro.user_get_Pixelised_Model( fittedParams, sbProfileFunc=gaussian_SBProfile_CXX) residual = image if (len(image.shape) == 2): residual -= image elif (len(image.shape) == 3): for i in range(image.shape[0]): residual[i] -= image[i] else: raise ValueError( "Error calculating residual: Image has an unknown rank") import pylab as pl ##Plot image and model f = pl.figure() ax = f.add_subplot(211) ax.set_title('Model') im = ax.imshow(model, interpolation='nearest') pl.colorbar(im) ax = f.add_subplot(212) ax.set_title('Image') if (len(image.shape) == 3): im = ax.imshow(image[0], interpolation='nearest') else: im = ax.imshow(image, interpolation='nearest') pl.colorbar(im) pl.show() ##Plot Residual f = pl.figure() ax = f.add_subplot(111) im = ax.imshow(residual, interpolation='nearest') ax.set_title('Image-Model') pl.colorbar(im) pl.show() if (np.isnan(maxima).sum() > 0): raise ValueError('get_ML_estimator - FATAL - NaNs found in maxima:', maxima) if (verbose): print 'Maxima found to be:', maxima ##Output Result if 
(outputHandle is not None): np.savetxt(outputHandle, np.array(maxima).reshape(1, maxima.shape[0])) ## Bias Correct if (biasCorrect == 0): Returned.append(maxima) elif (biasCorrect == 1): ana = mBias.analytic_GaussianLikelihood_Bias(maxima, fitParams, modelParams, order=biasCorrect, diffType='analytic') bc_maxima = maxima - ana ##Output Result if (bcoutputHandle is not None): np.savetxt(bcoutputHandle, np.array(bc_maxima).reshape(1, bc_maxima.shape[0])) if (verbose): print 'BC Maxima found to be:', bc_maxima ##Return minimised parameters Returned.append(maxima, bc_maxima) else: raise ValueError( 'get_ML_estimator - biasCorrect(ion) value entered is not applicable:' + str(biasCorrect)) ## Get Error on measurement. Brute error would have been constructed on the original brute force grid evaluation above. if (error is not None): if (err is not None): err = err #Do nothing elif (error.lower() == 'fisher'): err = fisher_Error_ML( maxima, fitParams, image, modelParams, modelLookup) #Use finalised modelParams here? else: raise ValueError( "get_ML_estimator - failed to return error, error requested, but value not found nor acceptable lable used" ) Returned.append(err) return Returned
def get_inverse_hvp_cg(model, y, v, data_set, method='Basic', **kwargs):
    """Solve for the inverse-Hessian-vector product H^-1 v by conjugate gradient.

    The solution is found by minimising the quadratic
    ``0.5 * x^T H x - v^T x`` (gradient ``H x - v``), where every product
    ``H x`` is accumulated over ``data_set`` in mini-batches via ``HVP``.

    Args:
        model: network object; its ``X`` and ``y`` attributes are used as the
            keys of the feed dictionary handed to ``HVP``.
        y: scalar output of the network whose Hessian is used (e.g. the loss).
        v: dictionary of arrays to be multiplied by the inverse Hessian.
        data_set: training set over which the Hessian is summed.
        method: ``'Newton'`` selects ``fmin_ncg``; any other value selects
            ``fmin_cg``.
        **kwargs: hyperparameters -- ``batch_size`` (128), ``damping`` (0.0),
            ``avextol`` (1e-8, Newton only), ``maxiter`` (1e2) and
            ``num_workers`` (6). Other keys are ignored.

    Returns:
        The minimiser, converted back to a dictionary with the shapes of ``v``.

    Note:
        Working state is kept in attributes of this function (``dl``,
        ``damp``, ``cnt``, ``fmt``, ``temp_hvp``), so concurrent calls would
        overwrite each other's state.
    """
    # Hyperparameters.
    # Remark from the original author: changing the batch size can change the
    # output slightly because of precision loss and parallel computing.
    batch_size = kwargs.pop('batch_size', 128)
    damping = kwargs.pop('damping', 0.0)
    avextol = kwargs.pop('avextol', 1e-8)
    maxiter = kwargs.pop('maxiter', 1e2)
    num_workers = kwargs.pop('num_workers', 6)

    get_inverse_hvp_cg.dl = DataLoader(data_set, batch_size, shuffle=False, num_workers=num_workers)
    get_inverse_hvp_cg.damp = damping
    get_inverse_hvp_cg.cnt = 0
    get_inverse_hvp_cg.fmt = {key: val.shape for (key, val) in v.items()}
    get_inverse_hvp_cg.temp_hvp = dic2vec(v)  # latest HVP, read by the callback
    t0 = time.time()

    def HVP_minibatch_val(y, v):
        """Return the damped Hessian-vector product averaged over the data set."""
        num_data = len(data_set)
        hvp_batch = {key: np.zeros_like(value) for key, value in v.items()}

        for img, lb in get_inverse_hvp_cg.dl:
            x_feed = {model.X: img.numpy(), model.y: lb.numpy()}
            hvp = HVP(y, x_feed, v)
            # The per-batch products are summed here ...
            for ks in hvp.keys():
                hvp_batch[ks] += hvp[ks]

        # ... and normalised only once afterwards to reduce precision loss.
        hvp_batch = {key: val / num_data for (key, val) in hvp_batch.items()}

        # Damping term. Iterate over the accumulator's own keys rather than
        # the loop-local ``hvp`` so this also works when the loader is empty.
        for ks in hvp_batch:
            hvp_batch[ks] += get_inverse_hvp_cg.damp * v[ks]

        # Remember the latest product for the progress callback.
        get_inverse_hvp_cg.temp_hvp = dic2vec(hvp_batch)
        return hvp_batch

    def fmin_loss_fn(x):
        # Quadratic objective 0.5 * x^T H x - v^T x for a flat vector x.
        x_dic = vec2dic(x, get_inverse_hvp_cg.fmt)
        hvp_val = HVP_minibatch_val(y, x_dic)
        return 0.5 * grad_inner_product(hvp_val, x_dic) - grad_inner_product(v, x_dic)

    def fmin_grad_fn(x):
        # Gradient of the objective: H x - v (x is a 1D vector).
        x_dic = vec2dic(x, get_inverse_hvp_cg.fmt)
        hvp_val = HVP_minibatch_val(y, x_dic)
        return dic2vec(hvp_val) - dic2vec(v)

    def fmin_hvp_fn(x, p):
        # Hessian of the objective applied to p; it does not depend on x.
        p_dic = vec2dic(p, get_inverse_hvp_cg.fmt)
        return dic2vec(HVP_minibatch_val(y, p_dic))

    def cg_callback(x):
        # Progress report. The values derived from ``temp_hvp`` belong to the
        # most recent HVP evaluation, which need not have been taken at x.
        print('iteration: {}'.format(get_inverse_hvp_cg.cnt), ', ', time.time()-t0, '(sec) elapsed')
        print('vector element-wise square: ', np.inner(x, x))
        grad_prev = get_inverse_hvp_cg.temp_hvp - dic2vec(v)
        print('temporal gradient value: ', np.inner(grad_prev, grad_prev))
        # 0.5 instead of 1/2 so the factor is not truncated to 0 by integer
        # division on Python 2.
        ambiguous_loss = 0.5 * np.inner(get_inverse_hvp_cg.temp_hvp, x) - np.inner(x, dic2vec(v))
        print('temporal function value(ambiguous): ', ambiguous_loss)
        get_inverse_hvp_cg.cnt += 1
        return 0

    if method == 'Newton':
        fmin_results = fmin_ncg(
            f=fmin_loss_fn, x0=dic2vec(v), fprime=fmin_grad_fn,
            fhess_p=fmin_hvp_fn, avextol=avextol, maxiter=maxiter, callback=cg_callback)
    else:
        fmin_results = fmin_cg(
            f=fmin_loss_fn, x0=dic2vec(v), fprime=fmin_grad_fn,
            maxiter=maxiter, callback=cg_callback)

    return vec2dic(fmin_results, get_inverse_hvp_cg.fmt)
#A = io.mmread('bcsstk06.mtx.gz') # clustered eigenvalues #B = io.mmread('bcsstm06.mtx.gz') n = A.shape[0] B = speye(n, n) random.seed(1) v_0 = random.rand(n) print("try fmin_bfgs") full_output = 1 data = [] v,fopt, gopt, Hopt, func_calls, grad_calls, warnflag, allvecs = \ optimize.fmin_bfgs(R,v_0,fprime=Rp,full_output=full_output,retall=1) if warnflag == 0: plt.semilogy(np.arange(0, len(data)), data) print('Rayleigh quotient BFGS', R(v)) print("fmin_bfgs OK") print("try fmin_ncg") # # WARNING: the program may hangs if fmin_ncg is used # data = [] v,fopt, fcalls, gcalls, hcalls, warnflag, allvecs = \ optimize.fmin_ncg(R,v_0,fprime=Rp,fhess=Rpp,full_output=full_output,retall=1) if warnflag == 0: plt.figure() plt.semilogy(np.arange(0, len(data)), data) print('Rayleigh quotient NCG', R(v))
# Script fragment: report cost/gradient at the initial theta, then fit theta
# with Newton-CG and plot the resulting decision boundary.
# costFunction, gradient, plotDecisionBoundary, initial_theta, X and y are
# defined earlier in the file.
cost = costFunction(initial_theta, X, y)
grad = gradient(initial_theta, X, y)
print('Cost at initial theta (zeros): ', cost)
print('Gradient at initial theta (zeros): ')
print(grad)

## ============= Part 3: Optimizing using an advanced optimization routine =============
#  In this exercise, you will use a built-in function to find the
#  optimal parameters theta.

import scipy.optimize as op

# Run Newton-Conjugate-Gradient to obtain the optimal theta
# (args supplies the extra (X, y) arguments to both costFunction and gradient).
theta = op.fmin_ncg(f=costFunction, x0=initial_theta, fprime=gradient, args=(X, y))
cost = costFunction(theta, X, y)

# Print theta to screen
# (the message still says "fminunc" although fmin_ncg is what ran above)
print('Cost at theta found by fminunc: ', cost)
print('theta: ')
print(theta)

# Plot Boundary
plotDecisionBoundary(theta, X, y)

## ============== Part 4: Predict and Accuracies ==============
#  After learning the parameters, you'll like to use it to predict the outcomes
#  on unseen data. In this part, you will use the logistic regression model
#  to predict the probability that a student with score 45 on exam 1 and
def optimize_J_reg(theta, X, y, L):
    """Minimise ``costFunctionReg`` with Newton-CG starting from ``theta``.

    ``X``, ``y`` and ``L`` are forwarded unchanged as the extra arguments of
    both ``costFunctionReg`` and ``gradientReg``; the search is capped at 400
    iterations. Returns whatever ``fmin_ncg`` returns.
    """
    extra_args = (X, y, L)
    optimum = fmin_ncg(
        costFunctionReg,
        x0=theta,
        fprime=gradientReg,
        args=extra_args,
        maxiter=400,
    )
    return optimum
def fit(self, X, y):
    """Train the logistic-regression model(s) and store them in ``self.all_theta``.

    With more than two distinct labels one model per class is trained
    (one-vs-rest) and ``self.all_theta`` has one row per class; otherwise a
    single binary model is trained.

    Parameters
    ----------
    X : 2-D array, one row per example and one column per feature.
    y : sequence of labels, one per example. Labels may be of any hashable
        type; they are mapped to the floats 0 ... n_labels - 1.

    Side effects
    ------------
    Sets ``self.decoder`` (encoded value -> original label) and
    ``self.all_theta``.

    Raises
    ------
    ValueError
        If ``self.solver`` is not one of ``'fmincg'``, ``'newton-cg'`` or
        ``'lbfgs'`` on the code paths that run the solver here (previously
        an unknown solver silently produced an all-zero model).
    """
    labels = set(y)            # set of classes
    n_labels = len(labels)     # number of classes
    # Encoder from the original labels (possibly non-numeric) to floats
    # 0 ... n_labels - 1, and the inverse mapping for decoding predictions.
    encoder = dict(zip(labels, np.arange(float(n_labels))))
    self.decoder = dict(zip(np.arange(float(n_labels)), labels))

    n = X.shape[1]             # number of features
    m = X.shape[0]             # number of examples
    # Prepend a column of ones to account for the bias term.
    X_aux = np.concatenate((np.ones((m, 1), dtype=np.float64), X), axis=1)
    # Starting point for the optimizer; the SciPy routines flatten x0 anyway.
    initial_theta = np.zeros(n + 1, dtype=np.float64)
    # Labels encoded as real values so they can be used in cost and gradient.
    y_enc = np.array([encoder[label] for label in y])

    def solve(targets):
        """Run the configured optimizer against ``targets`` and return theta."""
        # SciPy documents ``args`` as a tuple; a list breaks on versions that
        # concatenate it with the argument tuple.
        solver_args = (X_aux, targets, self.C)
        if self.solver == 'fmincg':
            return optimize.fmin_cg(self.cost_func, initial_theta,
                                    fprime=self.grad_cost_func,
                                    args=solver_args, maxiter=self.max_iter)
        if self.solver == 'newton-cg':
            return optimize.fmin_ncg(self.cost_func, initial_theta,
                                     fprime=self.grad_cost_func,
                                     args=solver_args, maxiter=self.max_iter)
        if self.solver == 'lbfgs':
            # fmin_l_bfgs_b returns (x, f, info); only x is needed.
            return optimize.fmin_l_bfgs_b(self.cost_func, initial_theta,
                                          fprime=self.grad_cost_func,
                                          args=solver_args,
                                          maxiter=self.max_iter)[0]
        raise ValueError('Unknown solver: %r' % (self.solver,))

    if n_labels > 2:
        # Multiclass: one parameter vector (row) per class.
        self.all_theta = np.zeros((n_labels, n + 1), dtype=np.float64)

        if self.n_jobs is None:
            # Sequential training.
            for c in range(n_labels):
                # 1.0 for the examples of class c, 0.0 for all the others.
                targets = (y_enc == c).astype(np.float64)
                self.all_theta[c, :] = solve(targets)
        else:
            # Parallel training. Per-class dictionaries keep the threads from
            # modifying shared 'y' / theta values at the same time.
            y_p = {}
            thetas = {}
            X_p = {}

            def train_class(c):
                # func2 fills y_p[c] / X_p[c] and func trains on them, so the
                # two must run in this order. They used to be submitted as
                # independent tasks, which let func start before its inputs
                # existed.
                self.func2(y_p, c, y_enc, X_p, X_aux)
                self.func(thetas, self.max_iter, n, c, X_p, y_p, self.C)

            # A pool with at most n_jobs threads, created once for all classes.
            with concurrent.futures.ThreadPoolExecutor(
                    max_workers=self.n_jobs) as executor:
                futures = [executor.submit(train_class, c)
                           for c in range(n_labels)]
                # result() re-raises any exception from the worker instead of
                # letting it disappear with the discarded future.
                for future in futures:
                    future.result()

            # Copy every trained vector into its row of the model matrix.
            for c in range(n_labels):
                self.all_theta[c, :] = thetas[c]
    else:
        # Binary logistic regression: a single model separating the examples
        # of one class from those of the other. The stored value is the 1-D
        # vector returned by the optimizer, as before.
        self.all_theta = solve(y_enc)
def fit(self, method='fmin_powell', iterlim=1000, tol=.0001, verbose=0, no_callback=False, **kwargs):
    """
    N.fit(method='fmin_powell', iterlim=1000, tol=.0001):

    Causes the normal approximation object to fit itself.

    method: May be one of the following, from the scipy.optimize package:
        -fmin_l_bfgs_b
        -fmin_ncg
        -fmin_cg
        -fmin_powell
        -fmin
    iterlim: Maximum number of iterations, passed as ``maxiter`` to every
        optimizer except fmin_l_bfgs_b.
    tol: Convergence tolerance, passed as ``avextol``, ``ftol``, ``gtol`` or
        ``pgtol`` depending on the optimizer.
    verbose: Passed as ``disp`` (``iprint=verbose - 1`` for fmin_l_bfgs_b).
    no_callback: Boolean indicating whether or not to use a callback
        function. If False and verbose > 0, a default callback prints the
        current log-probability at each iteration; a ``callback`` keyword
        must not be supplied in that case. Otherwise a ``callback`` given
        in kwargs is used if present, and a no-op callback if not.

    The kwargs are passed to the scipy.optimize functions. See there
    for more information.

    Raises ImportError if scipy is unavailable, ValueError for an unknown
    method or conflicting callback settings, and RuntimeError if the
    log-probability cannot be evaluated at the optimum.
    """
    self.tol = tol
    self.method = method
    self.verbose = verbose

    # Starting point: the current values of all stochastics, flattened into
    # one parameter vector.
    p = zeros(self.len, dtype=float)
    for stochastic in self.stochastics:
        p[self._slices[stochastic]] = ravel(stochastic.value)

    if not self.method == 'newton':
        if not scipy_imported:
            raise ImportError('Scipy is required to use EM and NormApprox')

    default_callback = (verbose > 0 and not no_callback)
    if default_callback and 'callback' in kwargs:
        # The message used to name a non-existent ``use_callback`` parameter.
        raise ValueError("For user-provided callback and verbose output"
                         " set no_callback to True")

    if default_callback:
        def callback(p):
            try:
                print_('Current log-probability : %f' % self.logp)
            except ZeroProbability:
                print_('Current log-probability : %f' % -Inf)
    elif 'callback' in kwargs:
        callback = kwargs.pop('callback')
    else:
        def callback(p):
            pass

    if self.method == 'fmin_ncg':
        p = fmin_ncg(f=self.func,
                     x0=p,
                     fprime=self.gradfunc,
                     fhess=self.hessfunc,
                     epsilon=self.eps,
                     maxiter=iterlim,
                     callback=callback,
                     avextol=tol,
                     disp=verbose,
                     **kwargs)

    elif self.method == 'fmin':
        p = fmin(func=self.func,
                 x0=p,
                 callback=callback,
                 maxiter=iterlim,
                 ftol=tol,
                 disp=verbose,
                 **kwargs)

    elif self.method == 'fmin_powell':
        p = fmin_powell(func=self.func,
                        x0=p,
                        callback=callback,
                        maxiter=iterlim,
                        ftol=tol,
                        disp=verbose,
                        **kwargs)

    elif self.method == 'fmin_cg':
        p = fmin_cg(f=self.func,
                    x0=p,
                    fprime=self.gradfunc,
                    epsilon=self.eps,
                    callback=callback,
                    maxiter=iterlim,
                    gtol=tol,
                    disp=verbose,
                    **kwargs)

    elif self.method == 'fmin_l_bfgs_b':
        # NOTE(review): distutils was removed in Python 3.12; this version
        # check needs a replacement before running there.
        from scipy import __version__ as sp_version
        from distutils.version import LooseVersion
        if LooseVersion(sp_version) >= LooseVersion('0.12.0'):
            p = fmin_l_bfgs_b(func=self.func,
                              x0=p,
                              fprime=self.gradfunc,
                              epsilon=self.eps,
                              callback=callback,
                              pgtol=tol,
                              iprint=verbose - 1,
                              **kwargs)[0]
        else:
            if verbose > 0:
                from warnings import warn
                warn("Callbacks are not available for fmin_l_bfgs_b in "
                     "SciPy < 0.12.0. Optimization progress will not be "
                     "displayed.", UserWarning)
            p = fmin_l_bfgs_b(func=self.func,
                              x0=p,
                              fprime=self.gradfunc,
                              epsilon=self.eps,
                              pgtol=tol,
                              iprint=verbose - 1,
                              **kwargs)[0]

    else:
        raise ValueError('Method unknown.')

    self._set_stochastics(p)
    self._mu = p

    try:
        self.logp_at_max = self.logp
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt/SystemExit
        # are no longer converted into a RuntimeError.
        raise RuntimeError(
            'Posterior probability optimization converged to value with zero probability.'
        )

    # log-likelihood of observed stochastics
    lnL = sum([x.logp for x in self.observed_stochastics])
    self.lnL = lnL

    self.AIC = 2. * (self.len - lnL)  # 2k - 2 ln(L)
    self.AICc = self.AIC + (
        (2 * self.len * (self.len + 1)) / float(self.data_len - self.len - 1))
    try:
        self.BIC = self.len * log(
            self.data_len) - 2. * lnL  # k ln(n) - 2 ln(L)
    except FloatingPointError:
        self.BIC = -Inf

    self.fitted = True
# Script fragment: evaluate the regularised cost at the initial theta, then
# fit theta with Newton-CG, plot the decision boundary and predict on X.
# costFunctionReg, plotDecisionBoundary, predict, X, y, initial_theta and
# lambda_val are defined earlier in the file.
cost, grad = costFunctionReg(initial_theta, X, y, lambda_val)
print('Cost at initial theta (zeros): %f' % cost)
# raw_input exists only on Python 2.
raw_input('Program paused. Press enter to continue')

# =================== Part 2: Regularization and Accuracies ===================
initial_theta = np.zeros((X.shape[1], 1))
lambda_val = 1
# costFunctionReg returns (cost, grad); the two lambdas split that pair into
# the objective and the gradient callables fmin_ncg expects. With
# full_output=True the first two returned items are the optimum and its cost.
fmin_ret = fmin_ncg(lambda t: (costFunctionReg(t, X, y, lambda_val)[0]), initial_theta, lambda t: (costFunctionReg(t, X, y, lambda_val)[1]), maxiter=400, full_output=True)
theta = fmin_ret[0]
cost = fmin_ret[1]
print('Cost at theta found by fmin: %f' % cost)
print('theta:')
print(theta)
plotDecisionBoundary(theta, X, y)
plt.title('lambda = %d' % lambda_val)
plt.legend()
p = predict(theta, X)
# Script fragment: time several scipy.optimize routines on f starting from
# K[0] and print, for each, the elapsed time, the distance to x_ref and the
# excess of f over f_ref. f, f_prime, hessian, K, x_ref, f_ref and the
# initial t0 are defined before this fragment.
#
# NOTE(review): per the SciPy documentation fmin_bfgs and fmin_ncg return only
# xopt unless full_output is set, so the trailing [0] on those calls would
# select the first coordinate rather than the whole solution (it is the right
# index for fmin_l_bfgs_b, which returns a tuple) -- confirm this is intended.

# BFGS without an analytic gradient.
x_bfgs = optimize.fmin_bfgs(f, K[0], disp=0)[0]
print(' BFGS: time %.2fs, x error %.2f, f error %.2f' % (time.time() - t0, np.sqrt(np.sum( (x_bfgs - x_ref)**2)), f(x_bfgs) - f_ref))

# L-BFGS with a numerically approximated gradient.
t0 = time.time()
x_l_bfgs = optimize.fmin_l_bfgs_b(f, K[0], approx_grad=1, disp=0)[0]
print(' L-BFGS: time %.2fs, x error %.2f, f error %.2f' % (time.time() - t0, np.sqrt(np.sum( (x_l_bfgs - x_ref)**2)), f(x_l_bfgs) - f_ref))

# BFGS with the analytic gradient f_prime.
t0 = time.time()
x_bfgs = optimize.fmin_bfgs(f, K[0], f_prime, disp=0)[0]
print(" BFGS w f': time %.2fs, x error %.2f, f error %.2f" % (time.time() - t0, np.sqrt(np.sum( (x_bfgs - x_ref)**2)), f(x_bfgs) - f_ref))

# L-BFGS with the analytic gradient f_prime.
t0 = time.time()
x_l_bfgs = optimize.fmin_l_bfgs_b(f, K[0], f_prime, disp=0)[0]
print("L-BFGS w f': time %.2fs, x error %.2f, f error %.2f" % (time.time() - t0, np.sqrt(np.sum( (x_l_bfgs - x_ref)**2)), f(x_l_bfgs) - f_ref))

# Newton-CG with analytic gradient and Hessian.
t0 = time.time()
x_newton = optimize.fmin_ncg(f, K[0], f_prime, fhess=hessian, disp=0)[0]
print(" Newton: time %.2fs, x error %.2f, f error %.2f" % (time.time() - t0, np.sqrt(np.sum( (x_newton - x_ref)**2)), f(x_newton) - f_ref))
pl.show()
print('Cost at initial theta (zeros): ', cost) ## ============= Part 2: Regularization and Accuracies ============= # Optional Exercise: # In this part, you will get to try different values of lambda and # see how regularization affects the decision coundart # # Try the following values of lambda (0, 1, 10, 100). # # How does the decision boundary change when you vary lambda? How does # the training set accuracy vary? # import scipy.optimize as op # Run Newton-Conjugate-Gradient to obtain the optimal theta theta = op.fmin_ncg(f=costFunctionReg, x0=initial_theta, fprime=gradientReg, args=(X, y, lambd)) cost = costFunctionReg(theta, X, y, lambd) # Plot Boundary plotDecisionBoundary(theta, X, y) # Compute accuracy on our training set p = predict(theta, X); acc = (np.sum(p == y) * 100.0)/m print('Train Accuracy: ', acc)
def minimize(func, x0, gradient=None, hessian=None, algorithm="default", **args):
    r"""
    This function is an interface to a variety of algorithms for computing
    the minimum of a function of several variables.

    INPUT:

    - ``func`` -- Either a symbolic function or a Python function whose
      argument is a tuple with `n` components

    - ``x0`` -- Initial point for finding minimum.

    - ``gradient`` -- Optional gradient function. This will be computed
      automatically for symbolic functions. For Python functions, it allows
      the use of algorithms requiring derivatives. It should accept a
      tuple of arguments and return a NumPy array containing the partial
      derivatives at that point.

    - ``hessian`` -- Optional hessian function. This will be computed
      automatically for symbolic functions. For Python functions, it allows
      the use of algorithms requiring derivatives. It should accept a tuple
      of arguments and return a NumPy array containing the second partial
      derivatives of the function.

    - ``algorithm`` -- String specifying algorithm to use. Options are
      ``'default'`` (for Python functions, the simplex method is the default)
      (for symbolic functions bfgs is the default):

       - ``'simplex'``

       - ``'powell'``

       - ``'bfgs'`` -- (Broyden-Fletcher-Goldfarb-Shanno) requires
         ``gradient``

       - ``'cg'`` -- (conjugate-gradient) requires gradient

       - ``'ncg'`` -- (newton-conjugate gradient) requires gradient; when no
         hessian is available it is left to the optimizer to approximate
         Hessian-vector products

    Any further keyword arguments are passed on to the ``scipy.optimize``
    routine that is used.

    OUTPUT:

    The minimizing point, as a vector over ``RDF``.

    A ``ValueError`` is raised if ``algorithm`` is not one of the names
    listed above.

    EXAMPLES::

        sage: vars=var('x y z')
        sage: f=100*(y-x^2)^2+(1-x)^2+100*(z-y^2)^2+(1-y)^2
        sage: minimize(f,[.1,.3,.4],disp=0)
        (1.00..., 1.00..., 1.00...)

        sage: minimize(f,[.1,.3,.4],algorithm="ncg",disp=0)
        (0.9999999..., 0.999999..., 0.999999...)

    Same example with just Python functions::

        sage: def rosen(x): # The Rosenbrock function
        ...      return sum(100.0r*(x[1r:]-x[:-1r]**2.0r)**2.0r + (1r-x[:-1r])**2.0r)
        sage: minimize(rosen,[.1,.3,.4],disp=0)
        (1.00..., 1.00..., 1.00...)

    Same example with a pure Python function and a Python function to
    compute the gradient::

        sage: def rosen(x): # The Rosenbrock function
        ...      return sum(100.0r*(x[1r:]-x[:-1r]**2.0r)**2.0r + (1r-x[:-1r])**2.0r)
        sage: import numpy
        sage: from numpy import zeros
        sage: def rosen_der(x):
        ...      xm = x[1r:-1r]
        ...      xm_m1 = x[:-2r]
        ...      xm_p1 = x[2r:]
        ...      der = zeros(x.shape,dtype=float)
        ...      der[1r:-1r] = 200r*(xm-xm_m1**2r) - 400r*(xm_p1 - xm**2r)*xm - 2r*(1r-xm)
        ...      der[0] = -400r*x[0r]*(x[1r]-x[0r]**2r) - 2r*(1r-x[0])
        ...      der[-1] = 200r*(x[-1r]-x[-2r]**2r)
        ...      return der
        sage: minimize(rosen,[.1,.3,.4],gradient=rosen_der,algorithm="bfgs",disp=0)
        (1.00..., 1.00..., 1.00...)
    """
    from sage.symbolic.expression import Expression
    from sage.ext.fast_eval import fast_callable
    import numpy
    from scipy import optimize

    symbolic = isinstance(func, Expression)
    if symbolic:
        # Compile the expression and its gradient into fast float callables.
        var_list = func.variables()
        var_names = [str(variable) for variable in var_list]
        fast_f = fast_callable(func, vars=var_names, domain=float)
        f = lambda p: fast_f(*p)
        fast_gradient_functions = [
            fast_callable(partial, vars=var_names, domain=float)
            for partial in func.gradient()
        ]
        gradient = lambda p: numpy.array(
            [a(*p) for a in fast_gradient_functions])
    else:
        f = func

    # A real list of floats, so the starting point can be consumed more than
    # once regardless of what ``map`` returns.
    start = [float(coordinate) for coordinate in x0]

    if algorithm == "default":
        algorithm = "simplex" if gradient is None else "bfgs"

    if algorithm == "simplex":
        result = optimize.fmin(f, start, **args)
    elif algorithm == "bfgs":
        result = optimize.fmin_bfgs(f, start, fprime=gradient, **args)
    elif algorithm == "cg":
        result = optimize.fmin_cg(f, start, fprime=gradient, **args)
    elif algorithm == "powell":
        result = optimize.fmin_powell(f, start, **args)
    elif algorithm == "ncg":
        if symbolic:
            hess_fast = [[
                fast_callable(a, vars=var_names, domain=float) for a in row
            ] for row in func.hessian()]
            hessian = lambda p: [[a(*p) for a in row] for row in hess_fast]
        # Only build the Hessian-vector product when there is a Hessian to
        # build it from; with neither given, fmin_ncg falls back to its own
        # approximation instead of calling ``None``.
        hessian_p = None
        if hessian is not None:
            hessian_p = lambda p, v: numpy.dot(numpy.array(hessian(p)), v)
        result = optimize.fmin_ncg(f, start, fprime=gradient,
                                   fhess=hessian, fhess_p=hessian_p, **args)
    else:
        # Previously an unknown name fell through to an UnboundLocalError.
        raise ValueError("unknown algorithm: %r" % (algorithm,))

    return vector(RDF, result)
def fit(self, method='fmin', iterlim=1000, tol=.0001, verbose=0):
    """
    N.fit(method='fmin', iterlim=1000, tol=.0001):

    Causes the normal approximation object to fit itself.

    method: May be one of the following, from the scipy.optimize package:
        -fmin_l_bfgs_b
        -fmin_ncg
        -fmin_cg
        -fmin_powell
        -fmin
    iterlim: Maximum number of iterations, passed as ``maxiter`` to every
        optimizer except fmin_l_bfgs_b.
    tol: Convergence tolerance, passed as ``avextol``, ``ftol``, ``gtol`` or
        ``pgtol`` depending on the optimizer.
    verbose: Passed as ``disp`` (``iprint=verbose - 1`` for fmin_l_bfgs_b);
        if > 0 the current log-probability is printed from the optimizer
        callback (no callback is passed to fmin_l_bfgs_b).

    Raises ImportError if scipy is unavailable, ValueError for an unknown
    method, and RuntimeError if the log-probability cannot be evaluated at
    the optimum.
    """
    self.tol = tol
    self.method = method
    self.verbose = verbose

    # Starting point: the current values of all stochastics, flattened into
    # one parameter vector.
    p = zeros(self.len, dtype=float)
    for stochastic in self.stochastics:
        p[self._slices[stochastic]] = ravel(stochastic.value)

    if not self.method == 'newton':
        if not scipy_imported:
            raise ImportError('Scipy is required to use EM and NormApprox')

    if self.verbose > 0:
        def callback(p):
            try:
                print_('Current log-probability : %f' % self.logp)
            except ZeroProbability:
                print_('Current log-probability : %f' % -Inf)
    else:
        def callback(p):
            pass

    if self.method == 'fmin_ncg':
        p = fmin_ncg(f=self.func,
                     x0=p,
                     fprime=self.gradfunc,
                     fhess=self.hessfunc,
                     epsilon=self.eps,
                     maxiter=iterlim,
                     callback=callback,
                     avextol=tol,
                     disp=verbose)

    elif self.method == 'fmin':
        p = fmin(func=self.func,
                 x0=p,
                 callback=callback,
                 maxiter=iterlim,
                 ftol=tol,
                 disp=verbose)

    elif self.method == 'fmin_powell':
        p = fmin_powell(func=self.func,
                        x0=p,
                        callback=callback,
                        maxiter=iterlim,
                        ftol=tol,
                        disp=verbose)

    elif self.method == 'fmin_cg':
        p = fmin_cg(f=self.func,
                    x0=p,
                    fprime=self.gradfunc,
                    epsilon=self.eps,
                    callback=callback,
                    maxiter=iterlim,
                    gtol=tol,
                    disp=verbose)

    elif self.method == 'fmin_l_bfgs_b':
        p = fmin_l_bfgs_b(func=self.func,
                          x0=p,
                          fprime=self.gradfunc,
                          epsilon=self.eps,
                          # callback=callback,
                          pgtol=tol,
                          iprint=verbose - 1)[0]

    else:
        raise ValueError('Method unknown.')

    self._set_stochastics(p)
    self._mu = p

    try:
        self.logp_at_max = self.logp
    except Exception:
        # Narrowed from a bare except so that KeyboardInterrupt/SystemExit
        # are no longer converted into a RuntimeError.
        raise RuntimeError(
            'Posterior probability optimization converged to value with zero probability.'
        )

    # log-likelihood of observed stochastics
    lnL = sum([x.logp for x in self.observed_stochastics])

    self.AIC = 2. * (self.len - lnL)  # 2k - 2 ln(L)
    try:
        self.BIC = self.len * log(
            self.data_len) - 2. * lnL  # k ln(n) - 2 ln(L)
    except FloatingPointError:
        self.BIC = -Inf

    self.fitted = True