Example #1
 def solve(self, sampler, param_init = None, K = None,
          resume = None, new_lr = None):
     """The solve function.
     Input:
         sampler: the data sampler. sampler.sample() should return a list
             of training data, either (X, Y, weight) or (X, Y, None)
             depending on whether weight is enforced.
         param_init: the initial parameter. See SolverMC for details.
         K: passed through to the underlying SolverMC solve / presolve
             calls. See SolverMC for details.
         resume: if given, the path of an npz dump (written when
             'dump_every' is set) from which training is resumed.
         new_lr: if given, overrides the base learning rate stored in
             the resume file.
     """
     mode = self._args.get('mode', 'lbfgs').lower()
     # even when we use Adagrad we create a solver_basic to deal with
     # function value and gradient computation, etc.
     solver_basic = SolverMC(self._gamma, self.loss, self.reg,
             self._args, self._lossargs, self._regargs,
             self._fminargs)
     param = param_init
     iter_start = 0
     if resume is not None:
         # load the previously dumped state from the resume file
         logging.debug('Resuming from %s' % resume)
         npzdata = np.load(resume)
         param = (npzdata['w'], npzdata['b'])
         iter_start = npzdata['iter'] + 1
         if 'accum_grad' in npzdata:
             accum_grad = npzdata['accum_grad']
         if 'base_lr' in npzdata:
             self._args['base_lr'] = npzdata['base_lr']
         if new_lr is not None:
             self._args['base_lr'] = new_lr
     timer = util.Timer()
     for iter in range(iter_start, self._args['num_iter']):
         Xbatch, Ybatch, weightbatch = sampler.sample(self._args['minibatch'])
         # carry out the computation
         if mode == 'lbfgs':
             accum_grad = None
             param = solver_basic.solve(Xbatch, Ybatch, weightbatch, param, K = K)
             logging.debug('iter %d time = %s' % \
                     (iter, str(timer.total(False))))
         else:
             # adagrad: compute gradient and update
             if iter == iter_start:
                 logging.debug("Adagrad: Initializing")
                 param_flat = solver_basic.presolve(\
                         Xbatch, Ybatch, weightbatch, param, K = K)
                 # we need to build the cache in solver_basic as well as
                 # the accumulated gradients
                 if iter == 0:
                     accum_grad = np.ones_like(param_flat) * \
                             (self._args.get('eta', 0.) ** 2) + \
                             np.finfo(np.float64).eps
                 if 'base_lr' not in self._args or self._args['base_lr'] < 0:
                     logging.debug("Adagrad: Performing line search")
                     # do a line search to get the value
                     self._args['base_lr'] = \
                             mathutil.wolfe_line_search_adagrad(param_flat,
                             lambda x: SolverMC.obj(x, solver_basic),
                             alpha = np.abs(self._args.get('base_lr', 1.)),
                             eta = self._args.get('eta', 0.))
                     # reset the timer to exclude the base learning rate tuning
                     # time
                     timer.reset()
             else:
                 solver_basic._X = Xbatch
                 solver_basic._Y = Ybatch
                 solver_basic._weight = weightbatch
             logging.debug("Adagrad: Computing func and grad")
             f0, g = SolverMC.obj(param_flat, solver_basic)
             logging.debug('gradient max/min: %f/%f' % (g.max(), g.min()))
             accum_grad += g * g
             # we are MINIMIZING, so go against the gradient direction
             param_flat -= g / np.sqrt(accum_grad) * self._args['base_lr']
             # The branch below re-evaluates the objective after the update;
             # it is useful for debugging but disabled by default for speed.
             if False:
                 f = SolverMC.obj(param_flat, solver_basic)[0]
                 logging.debug('iter %d f0 = %f f = %f time = %s' % \
                         (iter, f0, f,\
                         str(timer.total(False))))
             else:
                 logging.debug('iter %d f0 = %f time = %s' % \
                         (iter, f0, str(timer.total(False))))
             param = solver_basic.unflatten_params(param_flat)
         callback = self._args.get('callback', None)
         if callback is None:
             pass
         elif type(callback) is not list:
             cb_val = callback(param)
             logging.debug('cb: ' + str(cb_val))
         else:
             cb_val = [cb_func(param) for cb_func in callback]
             logging.debug('cb: ' + ' '.join([str(v) for v in cb_val]))
         if 'dump_every' in self._args and \
                 (iter + 1) % self._args['dump_every'] == 0:
             logging.debug('dumping param...')
             mpi.root_savez(self._args['dump_name'],\
                     iter=iter, w = param[0], b = param[1], \
                     accum_grad = accum_grad, base_lr = self._args['base_lr'])
     return param
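
The docstring above specifies the sampler contract but neither example shows one, so here is a minimal sketch of a conforming sampler together with a hedged usage example. The sampler class, the solver class name `StochasticSolver`, and its constructor arguments are assumptions made for illustration; only the `sample()` return convention of (X, Y, weight) or (X, Y, None), the `solve()` signature, and the `resume` / `new_lr` / dump keys come from the code above.

import numpy as np

class RandomMinibatchSampler:
    """A hypothetical sampler obeying the contract in the docstring:
    sample(n) returns (X, Y, weight) or (X, Y, None)."""
    def __init__(self, X, Y, weight = None):
        self._X = X
        self._Y = Y
        self._weight = weight

    def sample(self, minibatch):
        # draw a random minibatch (with replacement, for simplicity)
        idx = np.random.randint(self._X.shape[0], size = minibatch)
        w = None if self._weight is None else self._weight[idx]
        return self._X[idx], self._Y[idx], w

# Hypothetical usage; 'StochasticSolver' stands in for whatever class
# defines the solve() method shown above, and the constructor arguments
# are guesses based on the attributes that method uses.
#
# solver = StochasticSolver(gamma, loss, reg,
#         args = {'mode': 'adagrad', 'num_iter': 1000, 'minibatch': 256,
#                 'base_lr': -1, 'eta': 0.,
#                 'dump_every': 100, 'dump_name': 'solver_dump.npz'})
# w, b = solver.solve(RandomMinibatchSampler(Xtrain, Ytrain))
# # resume later from the dump, optionally with a new learning rate:
# w, b = solver.solve(RandomMinibatchSampler(Xtrain, Ytrain),
#                     resume = 'solver_dump.npz', new_lr = 0.01)
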
Example #2
 def solve(self, sampler, param_init = None):
     """The solve function.
     Input:
         sampler: the data sampler. sampler.sample() should return a list
             of training data, either (X, Y, weight) or (X, Y, None)
             depending on whether weight is enforced.
         param_init: the initial parameter. See SolverMC for details.
     """
     mode = self._args.get('mode', 'lbfgs').lower()
     # even when we use Adagrad we create a solver_basic to deal with
     # function value and gradient computation, etc.
     solver_basic = SolverMC(self._gamma, self.loss, self.reg,
             self._args, self._lossargs, self._regargs,
             self._fminargs)
     param = param_init
     timer = util.Timer()
     for iter in range(self._args['num_iter']):
         Xbatch, Ybatch, weightbatch = sampler.sample(self._args['minibatch'])
         # carry out the computation
         if mode == 'lbfgs':
             param = solver_basic.solve(Xbatch, Ybatch, weightbatch, param)
             logging.debug('iter %d time = %s' % \
                     (iter, str(timer.total(False))))
         else:
             # adagrad: compute gradient and update
             param_flat = solver_basic.presolve(\
                     Xbatch, Ybatch, weightbatch, param)
             if iter == 0:
                 # we need to build the cache in solver_basic as well as
                 # the accumulated gradients
                 accum_grad = np.ones_like(param_flat) * \
                         (self._args.get('eta', 0.) ** 2) + \
                         np.finfo(np.float64).eps
                 if self._args.get('base_lr', None) is None:
                     # do a line search to get the value
                     self._args['base_lr'] = \
                             mathutil.wolfe_line_search_adagrad(param_flat,
                             lambda x: SolverMC.obj(x, solver_basic),
                             eta = self._args.get('eta', 0.))
                     # reset the timer to exclude the base learning rate tuning
                     # time
                     timer.reset()
             f0, g = SolverMC.obj(param_flat, solver_basic)
             accum_grad += g * g
             # we are MINIMIZING, so go against the gradient direction
             param_flat -= g / np.sqrt(accum_grad) * self._args['base_lr']
             f = SolverMC.obj(param_flat, solver_basic)[0] 
             logging.debug('iter %d f0 = %f f = %f time = %s' % \
                     (iter, f0, f,\
                     str(timer.total(False))))
             param = solver_basic.unflatten_params(param_flat)
         callback = self._args.get('callback', None)
         if callback is None:
             continue
         if type(callback) is not list:
             cb_val = callback(param)
             logging.debug('cb: ' + str(cb_val))
         else:
             cb_val = [cb_func(param) for cb_func in callback]
             logging.debug('cb: ' + ' '.join([str(v) for v in cb_val]))
     # The stochastic part is done. See if we want to do fine-tuning with a
     # final batch L-BFGS pass.
     finetune = self._args.get('fine_tune', 0)
     if finetune > 0:
         solver_basic._fminargs['maxfun'] = int(finetune)
         # run the fine-tuning pass on the last sampled minibatch
         param = solver_basic.solve(Xbatch, Ybatch, weightbatch, param)
     return param
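
For reference, the core update in the adagrad branch of both examples reduces to the standalone sketch below: accumulate squared gradients and scale each step by 1/sqrt of the accumulator. The toy quadratic objective, the default step size, and the function name are made up for illustration; only the accumulator initialization and the update rule mirror the code above.

import numpy as np

def adagrad_minimize(grad_fn, x0, base_lr = 1., eta = 0., num_iter = 500):
    """Minimal AdaGrad loop mirroring the update in the examples above."""
    x = np.array(x0, dtype = np.float64)
    # same initialization as the examples: eta**2 plus machine epsilon
    accum_grad = np.ones_like(x) * eta ** 2 + np.finfo(np.float64).eps
    for _ in range(num_iter):
        g = grad_fn(x)
        accum_grad += g * g
        # we are MINIMIZING, so go against the gradient direction
        x -= g / np.sqrt(accum_grad) * base_lr
    return x

# toy example: f(x) = 0.5 * ||x - 3||^2, whose gradient is (x - 3)
x_min = adagrad_minimize(lambda x: x - 3., np.zeros(5))
print(x_min)  # should approach [3, 3, 3, 3, 3]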