def get_gradient(self, starting_gradient=None, cost=None, additional_cost=None):
    """
    This method allows you to define the gradient for this model manually. It should either work
    with a provided starting gradient (from upstream layers/models), or grab the training cost
    if no starting gradient is provided.

    Theano's subgraph gradient function is documented here:
    http://deeplearning.net/software/theano/library/gradient.html#theano.gradient.subgraph_grad

    .. warning::
        If the gradient of the cost with respect to any of the start variables is already part
        of the start dictionary, then it may be counted twice with respect to wrt
        (`get_params()`) and end (`get_inputs()`).

    You should only implement this method if you want to manually define your gradients for
    the model.

    Parameters
    ----------
    starting_gradient : dictionary of {variable: known_gradient}, optional
        The starting, known gradients for parameters.
    cost : theano expression, optional
        The cost expression to use when calculating the gradients. Defaults to
        `get_train_cost()`.
    additional_cost : theano expression, optional
        Any additional cost to add to the gradient.

    Returns
    -------
    tuple
        (Gradient with respect to params, gradient with respect to inputs)
    """
    # Check whether a starting gradient was provided.
    # If there are known gradients to start from, use those instead of this model's cost.
    if starting_gradient is not None:
        params_grad, next_starting_grad = theano.subgraph_grad(
            wrt=self.get_params(),
            end=raise_to_list(self.get_inputs()),
            start=starting_gradient,
            cost=additional_cost,
            details=False)
    # Otherwise, just use this model's cost to determine the gradient.
    else:
        # Use the cost if it was given.
        cost = cost or self.get_train_cost()
        if additional_cost is not None:
            # Add the extra cost term (the second positional argument of T.sum is `axis`,
            # so a plain addition is what "adding a cost" requires here).
            cost = cost + additional_cost
        params_grad, next_starting_grad = theano.subgraph_grad(
            wrt=self.get_params(),
            end=raise_to_list(self.get_inputs()),
            cost=cost,
            details=False)
    return (OrderedDict(zip(self.get_params(), params_grad)),
            OrderedDict(zip(raise_to_list(self.get_inputs()), next_starting_grad)))
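# A minimal sketch (not part of the original code) of how two models exposing this
# interface might be chained during backpropagation. The `head` and `base` objects
# below are hypothetical placeholders: `base` is assumed to feed its outputs into
# `head`, and the input-gradient dictionary returned by `head.get_gradient()` seeds
# the `starting_gradient` of `base`.
head_param_grads, head_input_grads = head.get_gradient(cost=head.get_train_cost())

# The gradients w.r.t. `head`'s inputs become the known starting gradients for `base`,
# keyed by the symbolic variables shared between the two models.
base_param_grads, base_input_grads = base.get_gradient(starting_gradient=head_input_grads)

# Merge the per-parameter gradients for use in a parameter update rule.
all_grads = OrderedDict(head_param_grads)
all_grads.update(base_param_grads)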
def compile_grad_functions(split_outputs, param_blocks, input_vars, loss, givens):
    """
    Compiles functions that compute the gradients for each block, propagating the gradients
    back to the preceding block's split outputs.

    :return: A list of compiled theano functions, one per block, ordered from the first block
        to the last.
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = OrderedDict()
            for s in split_outputs[i]:
                start[s] = like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            # Last block: differentiate the loss itself.
            grads, next_grads = subgraph_grad(
                start=start,
                end=end,
                cost=loss,
                wrt=param_blocks[i]
            )

            # Create the grad function
            grad_fns.append(theano.function(
                inputs=input_vars + start_vars,
                outputs=[loss] + grads + next_grads,
                on_unused_input='ignore',
                givens=givens
            ))
        else:
            # Earlier blocks: start from the gradients handed back by the succeeding block.
            grads, next_grads = subgraph_grad(
                start=start,
                end=end,
                wrt=param_blocks[i]
            )

            # Create the grad function
            grad_fns.append(theano.function(
                inputs=input_vars + start_vars,
                outputs=grads + next_grads,
                on_unused_input='ignore',
                givens=givens
            ))
    return grad_fns[::-1]
def compile_grad_functions(split_outputs, param_blocks, input_vars, loss, givens):
    """Compiles functions that compute the gradients for each block.

    Args:
        split_outputs: The split nodes.
        param_blocks: The parameters for each block.
        input_vars: The input variables to the network.
        loss: The training loss.
        givens: A dictionary of given variable values (computed during a dedicated
            forward pass).
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = collections.OrderedDict()
            for s in split_outputs[i]:
                start[s] = _like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            grads, out_grads = subgraph_grad(end=end, cost=loss, wrt=param_blocks[i])

            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=[loss] + grads + out_grads,
                                on_unused_input='ignore'))
        else:
            grads, out_grads = subgraph_grad(start=start, end=end, wrt=param_blocks[i])

            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=grads + out_grads,
                                on_unused_input='ignore',
                                givens=givens))
    return grad_fns[::-1]
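# The `_like` helper is not included in this snippet. A plausible minimal sketch is
# shown below, under the assumption that it creates a fresh symbolic variable of the
# same Theano type as a split output, to serve as the placeholder for the gradient
# flowing in from the succeeding block; the name `_like` and its exact behavior in
# the original code base are assumptions.
def _like(var):
    # Calling a variable's type constructs a new symbolic variable with the same
    # dtype and broadcastable pattern.
    return var.type('grad_of_%s' % (var.name or 'split'))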
def test_subgraph_grad():
    # Tests that the grad method with no known_grads
    # matches what happens if you use successive subgraph_grads

    x = theano.tensor.fvector('x')
    t = theano.tensor.fvector('t')
    w1 = theano.shared(np.random.randn(3, 4))
    w2 = theano.shared(np.random.randn(4, 2))
    a1 = theano.tensor.tanh(theano.tensor.dot(x, w1))
    a2 = theano.tensor.tanh(theano.tensor.dot(a1, w2))
    cost2 = theano.tensor.sqr(a2 - t).sum()
    cost2 += theano.tensor.sqr(w2.sum())
    cost1 = theano.tensor.sqr(w1.sum())

    params = [[w2], [w1]]
    costs = [cost2, cost1]
    grad_ends = [[a1], [x]]

    inputs = [t, x]
    rng = np.random.RandomState([2012, 11, 15])
    values = [rng.randn(2), rng.randn(3)]
    values = [np.cast[ipt.dtype](value) for ipt, value in zip(inputs, values)]

    wrt = [w2, w1]
    cost = cost2 + cost1

    true_grads = theano.grad(cost, wrt)
    true_grads = theano.function(inputs, true_grads)
    true_grads = true_grads(*values)

    from theano.compat.python2x import OrderedDict
    next_grad = None
    param_grads = []
    for i in xrange(2):
        param_grad, next_grad = theano.subgraph_grad(
            wrt=params[i], end=grad_ends[i],
            start=next_grad, cost=costs[i]
        )
        next_grad = OrderedDict(zip(grad_ends[i], next_grad))
        param_grads.extend(param_grad)

    pgrads = theano.function(inputs, param_grads)
    pgrads = pgrads(*values)

    for true_grad, pgrad in zip(true_grads, pgrads):
        assert(np.sum(np.abs(true_grad - pgrad)) < 0.00001)
def compile_grad_descent_functions(bn_updates, split_updates, split_outputs, param_blocks,
                                   input_vars, loss, givens, update_fn):
    """Compiles functions that perform a gradient descent step for each block.

    This function is complementary to `compile_grad_functions`.

    Args:
        bn_updates: A dictionary for updating the BN statistics.
        split_updates: A dictionary containing the update ops for the intermediate split
            outputs.
        split_outputs: The split nodes.
        param_blocks: The parameters for each block.
        input_vars: The input variables to the network.
        loss: The training loss.
        givens: A dictionary of given variable values (computed during a dedicated
            forward pass).
        update_fn: A lasagne update function that takes a list of gradients and parameters
            and returns a theano update dict.
    """
    grad_fns = []
    for i in range(len(param_blocks) - 1, -1, -1):
        if i > 0:
            end = split_outputs[i - 1]
        else:
            end = []

        if i < len(split_outputs):
            # Create gradient variables for all split vars
            start = collections.OrderedDict()
            for s in split_outputs[i]:
                start[s] = _like(s)
            start_vars = list(start.values())
        else:
            start = None
            start_vars = []

        if start is None:
            grads, out_grads = subgraph_grad(
                end=end,
                cost=loss,
                wrt=param_blocks[i],
            )

            # Compute the gradient descent update
            updates = update_fn(loss_or_grads=grads, params=param_blocks[i])

            # Update the BN statistics and store the intermediate outputs.
            updates.update(bn_updates)
            updates.update(split_updates)

            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=[loss] + out_grads,
                                updates=updates,
                                on_unused_input='ignore'))
        else:
            grads, out_grads = subgraph_grad(start=start, end=end, wrt=param_blocks[i])

            # Compute the gradient descent update
            updates = update_fn(loss_or_grads=grads, params=param_blocks[i])

            # Create the grad function
            grad_fns.append(
                theano.function(inputs=input_vars + start_vars,
                                outputs=out_grads,
                                updates=updates,
                                on_unused_input='ignore',
                                givens=givens))
    return grad_fns
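# A rough sketch (not from the original code) of how the compiled step functions might
# be driven for one minibatch, assuming `step_fns` is the list returned above (ordered
# from the last block to the first, as returned) and `batch_inputs` is a list of numeric
# values matching `input_vars`. Each function's output gradients are fed to the next
# function as its incoming `start_vars`.
def run_backward_pass(step_fns, batch_inputs):
    # The first function corresponds to the last block: it takes only the network inputs
    # and returns [loss] followed by the gradients w.r.t. its incoming split outputs.
    outputs = step_fns[0](*batch_inputs)
    loss_value, incoming_grads = outputs[0], list(outputs[1:])

    # Each remaining function applies its block's parameter update and hands back the
    # gradients for the block before it.
    for fn in step_fns[1:]:
        incoming_grads = list(fn(*(batch_inputs + incoming_grads)))

    return loss_value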