def solve(self, sampler, param_init=None, K=None, resume=None, new_lr=None):
    """The solve function.

    Input:
        sampler: the data sampler. sampler.sample() should return a list of
            training data, either (X, Y, weight) or (X, Y, None) depending
            on whether weight is enforced.
        param_init: the initial parameter. See SolverMC for details.
    """
    mode = self._args.get('mode', 'lbfgs').lower()
    # even when we use Adagrad we create a solver_basic to deal with
    # function value and gradient computation, etc.
    solver_basic = SolverMC(self._gamma, self.loss, self.reg, self._args,
                            self._lossargs, self._regargs, self._fminargs)
    param = param_init
    iter_start = 0
    if resume is not None:
        # load data from the dump we are resuming from
        logging.debug('Resuming from %s' % resume)
        npzdata = np.load(resume)
        param = (npzdata['w'], npzdata['b'])
        iter_start = npzdata['iter'] + 1
        if 'accum_grad' in npzdata:
            accum_grad = npzdata['accum_grad']
        if 'base_lr' in npzdata:
            self._args['base_lr'] = npzdata['base_lr']
    if new_lr is not None:
        self._args['base_lr'] = new_lr
    timer = util.Timer()
    for iter in range(iter_start, self._args['num_iter']):
        Xbatch, Ybatch, weightbatch = sampler.sample(self._args['minibatch'])
        # carry out the computation
        if mode == 'lbfgs':
            accum_grad = None
            param = solver_basic.solve(Xbatch, Ybatch, weightbatch, param,
                                       K=K)
            logging.debug('iter %d time = %s' %
                          (iter, str(timer.total(False))))
        else:
            # adagrad: compute gradient and update
            if iter == iter_start:
                logging.debug("Adagrad: Initializing")
                param_flat = solver_basic.presolve(
                    Xbatch, Ybatch, weightbatch, param, K=K)
                # we need to build the cache in solver_basic as well as
                # the accumulated gradients
                if iter == 0:
                    accum_grad = np.ones_like(param_flat) * \
                        (self._args.get('eta', 0.) ** 2) + \
                        np.finfo(np.float64).eps
                if 'base_lr' not in self._args or self._args['base_lr'] < 0:
                    logging.debug("Adagrad: Performing line search")
                    # do a line search to get the value
                    self._args['base_lr'] = \
                        mathutil.wolfe_line_search_adagrad(
                            param_flat,
                            lambda x: SolverMC.obj(x, solver_basic),
                            alpha=np.abs(self._args.get('base_lr', 1.)),
                            eta=self._args.get('eta', 0.))
                # reset the timer to exclude the base learning rate tuning
                # time
                timer.reset()
            else:
                solver_basic._X = Xbatch
                solver_basic._Y = Ybatch
                solver_basic._weight = weightbatch
            logging.debug("Adagrad: Computing func and grad")
            f0, g = SolverMC.obj(param_flat, solver_basic)
            logging.debug('gradient max/min: %f/%f' % (g.max(), g.min()))
            accum_grad += g * g
            # we are MINIMIZING, so go against the gradient direction
            param_flat -= g / np.sqrt(accum_grad) * self._args['base_lr']
            # the code below can be used for debugging, but is disabled
            # for speed considerations.
            if False:
                f = SolverMC.obj(param_flat, solver_basic)[0]
                logging.debug('iter %d f0 = %f f = %f time = %s' %
                              (iter, f0, f, str(timer.total(False))))
            else:
                logging.debug('iter %d f0 = %f time = %s' %
                              (iter, f0, str(timer.total(False))))
            param = solver_basic.unflatten_params(param_flat)
        callback = self._args.get('callback', None)
        if callback is None:
            pass
        elif type(callback) is not list:
            cb_val = callback(param)
            logging.debug('cb: ' + str(cb_val))
        else:
            cb_val = [cb_func(param) for cb_func in callback]
            logging.debug('cb: ' + ' '.join([str(v) for v in cb_val]))
        if 'dump_every' in self._args and \
                (iter + 1) % self._args['dump_every'] == 0:
            logging.debug('dumping param...')
            mpi.root_savez(self._args['dump_name'],
                           iter=iter, w=param[0], b=param[1],
                           accum_grad=accum_grad,
                           base_lr=self._args['base_lr'])
    return param
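# The sketch below is a minimal, self-contained illustration of the
# per-coordinate AdaGrad update performed in the adagrad branch above
# (accumulate squared gradients, then step against the gradient scaled by
# base_lr / sqrt(accum_grad)). The toy least-squares objective and all names
# here are illustrative assumptions, not part of SolverMC or this solver's
# interface.
def _adagrad_update_sketch(num_steps=100, base_lr=0.5, eta=0.):
    import numpy as np  # local import keeps the sketch self-contained
    rng = np.random.RandomState(0)
    A = rng.randn(20, 5)
    b = rng.randn(20)

    def toy_obj(w):
        # returns (function value, gradient), mirroring SolverMC.obj's output
        r = A.dot(w) - b
        return 0.5 * np.dot(r, r), A.T.dot(r)

    w = np.zeros(5)
    # same initialization pattern as above: eta ** 2 plus machine epsilon
    accum_grad = np.ones_like(w) * eta ** 2 + np.finfo(np.float64).eps
    for _ in range(num_steps):
        _, g = toy_obj(w)
        accum_grad += g * g
        # we are minimizing, so go against the gradient direction, with a
        # per-coordinate step size base_lr / sqrt(accum_grad)
        w -= g / np.sqrt(accum_grad) * base_lr
    return w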
def solve(self, sampler, param_init=None):
    """The solve function.

    Input:
        sampler: the data sampler. sampler.sample() should return a list of
            training data, either (X, Y, weight) or (X, Y, None) depending
            on whether weight is enforced.
        param_init: the initial parameter. See SolverMC for details.
    """
    mode = self._args.get('mode', 'lbfgs').lower()
    # even when we use Adagrad we create a solver_basic to deal with
    # function value and gradient computation, etc.
    solver_basic = SolverMC(self._gamma, self.loss, self.reg, self._args,
                            self._lossargs, self._regargs, self._fminargs)
    param = param_init
    timer = util.Timer()
    for iter in range(self._args['num_iter']):
        Xbatch, Ybatch, weightbatch = sampler.sample(self._args['minibatch'])
        # carry out the computation
        if mode == 'lbfgs':
            param = solver_basic.solve(Xbatch, Ybatch, weightbatch, param)
            logging.debug('iter %d time = %s' %
                          (iter, str(timer.total(False))))
        else:
            # adagrad: compute gradient and update
            param_flat = solver_basic.presolve(
                Xbatch, Ybatch, weightbatch, param)
            if iter == 0:
                # we need to build the cache in solver_basic as well as
                # the accumulated gradients
                accum_grad = np.ones_like(param_flat) * \
                    (self._args.get('eta', 0.) ** 2) + \
                    np.finfo(np.float64).eps
                if self._args.get('base_lr', None) is None:
                    # do a line search to get the value
                    self._args['base_lr'] = \
                        mathutil.wolfe_line_search_adagrad(
                            param_flat,
                            lambda x: SolverMC.obj(x, solver_basic),
                            eta=self._args.get('eta', 0.))
                # reset the timer to exclude the base learning rate tuning
                # time
                timer.reset()
            f0, g = SolverMC.obj(param_flat, solver_basic)
            accum_grad += g * g
            # we are MINIMIZING, so go against the gradient direction
            param_flat -= g / np.sqrt(accum_grad) * self._args['base_lr']
            f = SolverMC.obj(param_flat, solver_basic)[0]
            logging.debug('iter %d f0 = %f f = %f time = %s' %
                          (iter, f0, f, str(timer.total(False))))
            param = solver_basic.unflatten_params(param_flat)
        callback = self._args.get('callback', None)
        if callback is None:
            continue
        if type(callback) is not list:
            cb_val = callback(param)
            logging.debug('cb: ' + str(cb_val))
        else:
            cb_val = [cb_func(param) for cb_func in callback]
            logging.debug('cb: ' + ' '.join([str(v) for v in cb_val]))
    # the stochastic part is done. See if we want to do fine-tuning.
    finetune = self._args.get('fine_tune', 0)
    if finetune > 0:
        solver_basic._fminargs['maxfun'] = int(finetune)
        # fine-tune on the last sampled minibatch (the original referenced
        # undefined names X, Y, weight here)
        param = solver_basic.solve(Xbatch, Ybatch, weightbatch, param)
    return param
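# The class below is a minimal sketch of the sampler protocol that the
# docstrings above assume: sample(size) returns (X, Y, weight), with weight
# set to None when no per-sample weighting is enforced. The class name and
# the in-memory random sampling are illustrative assumptions, not part of
# this module's actual sampler implementations.
class _SimpleSamplerSketch(object):
    def __init__(self, X, Y, weight=None):
        self._X = X
        self._Y = Y
        self._weight = weight

    def sample(self, size):
        import numpy as np  # local import keeps the sketch self-contained
        # draw a random minibatch of the requested size, with replacement
        idx = np.random.randint(0, self._X.shape[0], size)
        Xbatch = self._X[idx]
        Ybatch = self._Y[idx]
        weightbatch = None if self._weight is None else self._weight[idx]
        return Xbatch, Ybatch, weightbatch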