def __init__(self, objective, params, inputs=None,
             param_constrainers=None, max_iter=-1,
             lr_scalers=None, verbose=0, tol=None,
             init_alpha=None, min_init_alpha=1e-3,
             reset_alpha=True, conjugate=False,
             reset_conjugate=True, gradients=None,
             gradient_updates=None, line_search_mode=None,
             accumulate=False, theano_function_mode=None):
    """
    objective: a Theano expression to be minimized. Should be a function of
        `params` and, if provided, `inputs`.
    params: a list of Theano shared variables. These are the optimization
        variables.
    inputs: (optional) a list of Theano variables to serve as inputs to the
        graph.
    param_constrainers: (optional) a list of callables to be called on all
        update dictionaries to be applied to `params`. This is how you
        implement constrained optimization.
    reset_alpha: if True, reverts to using `init_alpha` after each call. If
        False, the final set of alphas is used at the start of the next call
        to `minimize`.
    conjugate: if True, tries to pick conjugate gradient directions. For the
        directions to be truly conjugate, you must use
        line_search_mode='exhaustive' and the objective function must be
        quadratic. Using line_search_mode='exhaustive' on a non-quadratic
        objective function implements nonlinear conjugate gradient descent.
    reset_conjugate: has no effect unless `conjugate` is True. If True,
        reverts to the direction of steepest descent for the first step of
        each call to `minimize`; otherwise, tries to make the new search
        direction conjugate to the last one (even though the objective
        function might be totally different on each call to `minimize`).
    gradients: if None, compute the gradients of `objective` using T.grad;
        otherwise, a dictionary mapping params to expressions for their
        gradients (this allows you to use approximate gradients computed
        with something other than T.grad).
    gradient_updates: a dictionary of shared variable updates to run each
        time the gradient is computed.

    Calling the `minimize` method with values for `inputs` will update
    `params` to minimize `objective`.
    """
    self.__dict__.update(locals())
    del self.self

    if line_search_mode is None:
        if init_alpha is None:
            init_alpha = (.001, .005, .01, .05, .1)
    else:
        assert line_search_mode == 'exhaustive'
        if init_alpha is None:
            init_alpha = (.5, 1.)
    self.init_alpha = tuple([float(elem) for elem in init_alpha])

    if inputs is None:
        inputs = []
    if param_constrainers is None:
        param_constrainers = []

    obj = objective
    self.verbose = verbose

    param_to_grad_sym = OrderedDict()
    param_to_grad_shared = OrderedDict()
    updates = OrderedDict()
    if self.gradient_updates is not None:
        updates.update(self.gradient_updates)

    self.params = [param for param in params]

    for param in params:
        if self.gradients is not None and param in self.gradients:
            g = self.gradients[param]
        else:
            g = grad(objective, param)
        param_to_grad_sym[param] = g
        if param.name is not None:
            param_name = param.name
        else:
            param_name = 'anon_param'
        grad_name = 'BatchGradientDescent.grad_' + param_name
        grad_shared = sharedX(param.get_value() * 0., name=grad_name)
        param_to_grad_shared[param] = grad_shared
        updates[grad_shared] = g

    self.param_to_grad_shared = param_to_grad_shared

    if self.verbose:
        logger.info('batch gradient class compiling gradient function')
    t1 = time.time()
    if self.accumulate:
        self._compute_grad = Accumulator(inputs, updates=updates)
    else:
        self._compute_grad = function(
            inputs,
            updates=updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._compute_grad')
    if self.verbose:
        t2 = time.time()
        logger.info('done. Took {0}'.format(t2 - t1))

    if self.verbose:
        logger.info('batch gradient class compiling objective function')
    if self.accumulate:
        self.obj = Accumulator(inputs, obj)
    else:
        self.obj = function(inputs, obj, mode=self.theano_function_mode,
                            name='BatchGradientDescent.obj')
    if self.verbose:
        logger.info('done')

    self.param_to_cache = OrderedDict()
    alpha = T.scalar(name='alpha')
    alpha.tag.test_value = np.cast[alpha.dtype](.01)
    cache_updates = OrderedDict()
    goto_updates = OrderedDict()

    for param in params:
        if param.name is None:
            param_name = 'anon_param'
        else:
            param_name = param.name
        cache_name = 'BatchGradientDescent.param_to_cache[%s]' % param_name
        self.param_to_cache[param] = sharedX(param.get_value(borrow=False),
                                             name=cache_name)
        cache_updates[self.param_to_cache[param]] = param
        cached = self.param_to_cache[param]
        g = self.param_to_grad_shared[param]
        if lr_scalers is not None and param in lr_scalers:
            scaled_alpha = alpha * lr_scalers[param]
        else:
            scaled_alpha = alpha
        mul = scaled_alpha * g
        diff = cached - mul
        goto_updates[param] = diff

    self._cache_values = function(
        [],
        updates=cache_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._cache_values')

    assert isinstance(param_constrainers, (list, tuple))
    for param_constrainer in param_constrainers:
        param_constrainer(goto_updates)

    self._goto_alpha = function(
        [alpha],
        updates=goto_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._goto_alpha')

    norm = T.sqrt(sum([T.sqr(elem).sum()
                       for elem in self.param_to_grad_shared.values()]))
    norm.name = 'BatchGradientDescent.norm'
    normalize_grad_updates = OrderedDict()
    for grad_shared in self.param_to_grad_shared.values():
        normalize_grad_updates[grad_shared] = grad_shared / norm

    # useful for monitoring
    self.ave_grad_size = sharedX(0.)
    self.new_weight = sharedX(1.)
    normalize_grad_updates[self.ave_grad_size] = \
        self.new_weight * norm + (1. - self.new_weight) * self.ave_grad_size

    self._normalize_grad = function(
        [], norm,
        updates=normalize_grad_updates,
        mode=self.theano_function_mode,
        name='BatchGradientDescent._normalize_grad')

    if self.conjugate:
        grad_shared = self.param_to_grad_shared.values()

        grad_to_old_grad = OrderedDict()
        for elem in grad_shared:
            grad_to_old_grad[elem] = sharedX(elem.get_value(),
                                             'old_' + elem.name)

        self._store_old_grad = function(
            [norm],
            updates=OrderedDict([(grad_to_old_grad[g_], g_ * norm)
                                 for g_ in grad_to_old_grad]),
            mode=self.theano_function_mode,
            name='BatchGradientDescent._store_old_grad')

        grad_ordered = list(grad_to_old_grad.keys())
        old_grad_ordered = [grad_to_old_grad[g_] for g_ in grad_ordered]

        def dot_product(x, y):
            return sum([(x_elem * y_elem).sum()
                        for x_elem, y_elem in safe_zip(x, y)])

        beta_pr = (dot_product(grad_ordered, grad_ordered) -
                   dot_product(grad_ordered, old_grad_ordered)) / \
            (1e-7 + dot_product(old_grad_ordered, old_grad_ordered))
        assert beta_pr.ndim == 0
        beta = T.maximum(beta_pr, 0.)

        # beta_pr is the Polak-Ribiere formula for beta. According to
        # Wikipedia, the beta to use for NCG is "a matter of heuristics or
        # taste", but max(0, beta_pr) is "a popular choice ... which
        # provides direction reset automatically", i.e. it reverts to
        # steepest descent once you have traveled far enough that the
        # objective function behaves non-quadratically and the conjugate
        # gradient formulas no longer apply.
        # http://en.wikipedia.org/wiki/Nonlinear_conjugate_gradient_method

        assert grad not in grad_to_old_grad

        make_conjugate_updates = [(g_, g_ + beta * grad_to_old_grad[g_])
                                  for g_ in grad_ordered]

        mode = self.theano_function_mode
        if mode is not None and hasattr(mode, 'record'):
            for v, u in make_conjugate_updates:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate var ' +
                    var_descriptor(v) + '\n')
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate update ' +
                    var_descriptor(u) + '\n')

        self._make_conjugate = function(
            [], updates=make_conjugate_updates,
            mode=self.theano_function_mode,
            name='BatchGradientDescent._make_conjugate')

        if mode is not None and hasattr(mode, 'record'):
            for output in self._make_conjugate.maker.fgraph.outputs:
                mode.record.handle_line(
                    'BatchGradientDescent._make_conjugate output ' +
                    var_descriptor(output) + '\n')

    if tol is None:
        if objective.dtype == "float32":
            self.tol = 1e-6
        else:
            self.tol = 3e-7
    else:
        self.tol = tol

    self.ave_step_size = sharedX(0.)
    self.ave_grad_mult = sharedX(0.)
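A minimal usage sketch of the interface documented in the docstring above. It is illustrative only and not part of the original module; it assumes pylearn2 and Theano are importable, and the toy names (`x`, `target`, `minimizer`) are made up for the example.

import numpy as np
import theano.tensor as T
from pylearn2.utils import sharedX
from pylearn2.optimization.batch_gradient_descent import BatchGradientDescent

# A toy quadratic objective in a single shared parameter vector.
x = sharedX(np.zeros(3), name='x')
target = np.asarray([1., -2., 3.])
objective = T.sqr(x - target).sum()

minimizer = BatchGradientDescent(objective, params=[x],
                                 max_iter=10,
                                 conjugate=True,
                                 line_search_mode='exhaustive',
                                 verbose=0)
# With no `inputs`, minimize() takes no arguments; it should move x toward `target`.
minimizer.minimize()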
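The conjugate step built in `_make_conjugate` above is the Polak-Ribiere update with the automatic reset beta = max(0, beta_pr). A NumPy-only sketch of the same computation on flat vectors, purely for illustration (the function name and the flat-vector setting are assumptions, not part of the original class):

import numpy as np

def polak_ribiere_direction(grad_new, old_direction, eps=1e-7):
    """Combine the new gradient with the previous search direction using the
    Polak-Ribiere beta; max(0, beta_pr) reverts to steepest descent whenever
    beta_pr goes negative."""
    beta_pr = (np.dot(grad_new, grad_new) - np.dot(grad_new, old_direction)) \
        / (eps + np.dot(old_direction, old_direction))
    beta = max(0., beta_pr)
    # The step is then taken along the negative of this combined direction.
    return grad_new + beta * old_direction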
def setup_impl(self, model, dataset, algorithm):
    cost = algorithm.cost

    root = model.get_param_vector()
    dim = root.size

    rng = self.rng

    points = rng.randn(self.num_points, self.num_basis_vectors)
    points = points.astype(root.dtype)
    points *= self.scale

    if self.include_root:
        points[0, :] = 0.

    if not hasattr(self, 'cost_fn'):
        # Set up the data_specs plumbing needed to evaluate the cost function.
        data_specs = cost.get_data_specs(model)
        mapping = DataSpecsMapping(data_specs)
        space_tuple = mapping.flatten(data_specs[0], return_tuple=True)
        source_tuple = mapping.flatten(data_specs[1], return_tuple=True)

        # Build a flat tuple of Theano Variables, one for each space.
        # We want that so that if the same space/source is specified
        # more than once in data_specs, only one Theano Variable
        # is generated for it, and the corresponding value is passed
        # only once to the compiled Theano function.
        theano_args = []
        for space, source in safe_zip(space_tuple, source_tuple):
            name = '%s[%s]' % (self.__class__.__name__, source)
            arg = space.make_theano_batch(name=name,
                                          batch_size=self.batch_size)
            theano_args.append(arg)
        theano_args = tuple(theano_args)

        # Methods of `cost` need args to be passed in a format compatible
        # with data_specs
        nested_args = mapping.nest(theano_args)
        fixed_var_descr = cost.get_fixed_var_descr(model, nested_args)
        self.on_load_batch = fixed_var_descr.on_load_batch

        cost_value = cost.expr(model, nested_args,
                               **fixed_var_descr.fixed_vars)
        # End of data_specs setup.

        print "Compiling cost function..."
        cost_fn = function(theano_args, cost_value)
        self.cost_fn = cost_fn
    else:
        cost_fn = self.cost_fn

    cost_values = np.zeros(self.num_points)

    data = list(dataset.get_batch_design(self.batch_size,
                                         include_labels=True))
    from pylearn2.utils.one_hot import one_hot
    data[1] = one_hot(data[1])

    if self.method == 'gaussian':
        # rng.randn gives a (dim, num_basis_vectors) standard normal matrix;
        # the previous call rng.normal(dim, self.num_basis_vectors) passed
        # dim and num_basis_vectors as loc and scale and returned a scalar.
        basis = rng.randn(dim, self.num_basis_vectors).astype(root.dtype)
    elif self.method == 'element':
        basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
        for i in xrange(self.num_basis_vectors):
            basis[rng.randint(dim), i] = 1.
    elif self.method == 'gradient':
        if not hasattr(self, 'grad_fn'):
            self.grad_fn = function(theano_args,
                                    grad(cost_value, model.get_params()))
        grad_fn = self.grad_fn

        basis = np.zeros((dim, self.num_basis_vectors)).astype(root.dtype)
        for i in xrange(self.num_basis_vectors):
            ipt = list(dataset.get_batch_design(1, include_labels=True))
            label = ipt[1]
            assert label.size == 1
            label = label[0]
            # NOTE: hard-codes 10 classes; renamed to avoid shadowing the
            # imported one_hot function.
            one_hot_label = np.zeros((1, 10,), dtype='float32')
            one_hot_label[0, label] = 1
            ipt[1] = one_hot_label
            g = grad_fn(*ipt)
            basis[:, i] = np.concatenate([e.reshape(e.size) for e in g],
                                         axis=0)
    else:
        assert False

    basis /= np.sqrt(np.square(basis).sum(axis=0))

    # Orthogonalize the basis with Gram-Schmidt. Project out every previously
    # accepted direction, i.e. j in xrange(i); the earlier xrange(i - 1)
    # skipped the immediately preceding vector.
    for i in xrange(self.num_basis_vectors):
        v = basis[:, i].copy()
        for j in xrange(i):
            u = basis[:, j].copy()
            v -= np.dot(u, v) * u
        norm = np.sqrt(np.square(v).sum())
        assert norm > 1e-4
        v /= norm
        basis[:, i] = v

    for i in xrange(self.num_points):
        print "Evaluating cost at point ", i
        point = points[i, :]
        full_point = root + np.dot(basis, point)
        model.set_param_vector(full_point)
        cost_values[i] = cost_fn(*data)
        print cost_values[i]

    from pylearn2.utils import sharedX
    import theano.tensor as T

    print "Fitting the quadratic function..."
    if not hasattr(self, 'fit_quad'):
        points = sharedX(points)
        # from theano import config
        # config.compute_test_value = 'raise'
        cost_values = sharedX(cost_values)
        A = sharedX(np.zeros((self.num_basis_vectors,
                              self.num_basis_vectors)))
        if self.psd:
            mat = T.dot(A.T, A)
        else:
            mat = A
        b = sharedX(np.zeros(self.num_basis_vectors))
        c = sharedX(0.)

        half_quad = T.dot(points, mat)
        quad = (points * half_quad).sum(axis=1)
        lin = T.dot(points, b)
        pred = quad + lin + c

        from pylearn2.optimization.batch_gradient_descent import \
            BatchGradientDescent

        mse = T.square(pred - cost_values).mean()
        mae = abs(pred - cost_values).mean()

        # self.fitting_cost selects either 'mse' or 'mae' defined just above.
        obj = locals()[self.fitting_cost]

        fit_quad = BatchGradientDescent(obj, params=[A, b, c],
                                        max_iter=self.num_basis_vectors ** 2,
                                        verbose=3, tol=None,
                                        init_alpha=None, min_init_alpha=1e-7,
                                        reset_alpha=False, conjugate=True,
                                        reset_conjugate=False,
                                        line_search_mode='exhaustive')
        self.fit_quad = fit_quad
        self.A = A
        self.b = b
        self.c = c
        self.points = points
        self.cost_values = cost_values
    else:
        self.A.set_value(.001 * np.identity(self.A.get_value().shape[0],
                                            dtype=self.A.dtype))
        self.b.set_value(self.b.get_value() * 0.)
        self.c.set_value(self.c.get_value() * 0.)
        self.points.set_value(points)
        self.cost_values.set_value(cost_values.astype(self.cost_values.dtype))

    self.fit_quad.minimize()

    print "Finding its minimum..."

    if self.use_solver:
        if self.psd:
            Av = self.A.get_value()
            mat_v = np.dot(Av.T, Av)
        else:
            mat_v = self.A.get_value()
        bv = self.b.get_value()

        # Minimize x^T mat_v x + b^T x + c
        # -> solve 2 mat_v x + b = 0
        # -> mat_v x = -b / 2
        print "mat_v range: ", mat_v.min(), mat_v.max()
        x, ignored_residuals, ignored_rank, ignored_singular_values = \
            np.linalg.lstsq(mat_v, -0.5 * bv)
        print "solution: ", x.min(), x.mean(), x.max()
        print "singular values: ", ignored_singular_values.min(), \
            ignored_singular_values.max()
        assert x.ndim == 1, x.shape

        prod = np.dot(basis, x)
        norm = np.sqrt(np.square(prod).sum())
        print "Moving params by ", norm
        vector = root + prod
        model.set_param_vector(vector)

    else:  # use the iterative minimizer instead of the closed-form solver
        if not hasattr(self, 'fit_params'):
            self.vector = sharedX(points.get_value().mean(axis=0))
            vector = self.vector
            obj = T.dot(T.dot(mat, vector), vector) + T.dot(b, vector)

            def constrain(d):
                assert vector in d
                n = d[vector]
                norm = T.sqrt(T.square(n).sum())
                desired_norm = T.clip(norm, 0., self.max_jump_norm)
                d[vector] = n * desired_norm / norm

            self.fit_params = BatchGradientDescent(
                obj, params=[vector],
                max_iter=self.num_basis_vectors,
                verbose=3, tol=None,
                param_constrainers=[constrain],
                init_alpha=None, min_init_alpha=1e-3,
                reset_alpha=False, conjugate=True,
                reset_conjugate=False,
                line_search_mode='exhaustive')
        else:
            self.vector.set_value(
                points.mean(axis=0).astype(self.vector.dtype))

        self.fit_params.minimize()

        model.set_param_vector(root + np.dot(basis, self.vector.get_value()))
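`setup_impl` fits a quadratic model cost(x) ~= x^T M x + b^T x + c over the coordinates in the random basis and, in the `use_solver` branch, jumps to its stationary point by solving 2 M x + b = 0 with least squares. A small self-contained NumPy illustration of that closed-form step; the function name and example values are made up, not from the original code.

import numpy as np

def quadratic_argmin(M, b):
    """Stationary point of f(x) = x^T M x + b^T x + c:
    grad f = 2 M x + b = 0  ->  M x = -b / 2."""
    x, _, _, _ = np.linalg.lstsq(M, -0.5 * b)
    return x

# Example: f(x) = x1**2 + x2**2 - 2*x1 + 4*x2 has its minimum at (1, -2).
M = np.eye(2)
b = np.array([-2., 4.])
print(quadratic_argmin(M, b))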