def __init__(self, steps=1, num_layers=2, num_units=32, eps=1e-2):
    # Symbolic state, measurement, covariances and time step.
    self.X, self.Z = T.fvectors('X', 'Z')
    self.P, self.Q, self.R = T.fmatrices('P', 'Q', 'R')
    self.dt = T.scalar('dt')
    self.matrix_inv = T.nlinalg.MatrixInverse()

    # Learned autoregressive motion model over a window of `steps` values.
    self.ar = AutoRegressiveModel(steps=steps, num_layers=num_layers,
                                  num_units=num_units, eps=eps)

    l = InputLayer(input_var=self.X, shape=(steps,))
    l = ReshapeLayer(l, shape=(1, steps))
    l = self.ar.network(l)
    l = ReshapeLayer(l, shape=(1,))
    self.l_ = l
    self.f_ = get_output(self.l_)

    # Predicted state: the new AR output stacked on the shifted history.
    self.X_ = T.concatenate([self.f_, T.dot(T.eye(steps)[:-1], self.X)],
                            axis=0)
    self.fX_ = G.jacobian(self.X_.flatten(), self.X)

    # Predicted covariance; process noise enters only on the newest entry.
    self.P_ = T.dot(T.dot(self.fX_, self.P), T.transpose(self.fX_)) + \
        T.dot(T.dot(T.eye(steps)[:, 0:1], self.dt * self.Q),
              T.eye(steps)[0:1, :])

    # Measurement model: observe the newest state entry.
    self.h = T.dot(T.eye(steps)[0:1], self.X_)
    self.y = self.Z - self.h
    self.hX_ = G.jacobian(self.h, self.X_)

    # Innovation covariance, Kalman gain, and the corrected state/covariance.
    self.S = T.dot(T.dot(self.hX_, self.P_), T.transpose(self.hX_)) + self.R
    self.K = T.dot(T.dot(self.P_, T.transpose(self.hX_)),
                   self.matrix_inv(self.S))
    self.X__ = self.X_ + T.dot(self.K, self.y)
    self.P__ = T.dot(T.identity_like(self.P) - T.dot(self.K, self.hX_),
                     self.P_)

    self.prediction = theano.function(inputs=[self.X, self.P, self.Q, self.dt],
                                      outputs=[self.X_, self.P_],
                                      allow_input_downcast=True)
    self.update = theano.function(inputs=[self.X, self.Z, self.P, self.Q,
                                          self.R, self.dt],
                                  outputs=[self.X__, self.P__],
                                  allow_input_downcast=True)

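# A minimal driving sketch for the filter above, assuming the enclosing class
# is importable as NeuralKalmanFilter (the class name, `measurements`, and the
# noise levels are hypothetical; only prediction/update come from the snippet):
import numpy as np

def run_filter(measurements, steps=4, dt=1.0):
    nkf = NeuralKalmanFilter(steps=steps)       # hypothetical class name
    x = np.zeros(steps, dtype=np.float32)       # state history window
    p = np.eye(steps, dtype=np.float32)         # state covariance
    q = np.eye(1, dtype=np.float32) * 1e-3      # process noise (1x1)
    r = np.eye(1, dtype=np.float32) * 1e-2      # measurement noise (1x1)
    for z in measurements:
        x, p = nkf.prediction(x, p, q, dt)                   # predict
        x, p = nkf.update(x, np.float32([z]), p, q, r, dt)   # correct
    return x, p
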
def initialize_calc_ll_gmm_fun(self):
    # Symbolic inputs: flattened data, GMM means/covariances, mixture
    # weights, and a Lagrange multiplier enforcing sum(weights) == 1.
    Yvec = T.dvector('Y')
    meansvec = T.dvector('means')
    covarsvec = T.dvector('covars')
    weights = T.dvector('weights')
    lam = T.dscalar('lambda')

    ndim = meansvec.shape[0] // self.gm_num  # integer division, made explicit
    Y = T.reshape(Yvec, (Yvec.shape[0] // ndim, ndim))
    LL, p1, p2 = self.calc_ll_gmm(
        Y,
        T.reshape(meansvec, (self.gm_num, ndim)),
        T.reshape(covarsvec, (self.gm_num, ndim)),
        weights)

    # Lagrangian of the log-likelihood with the weight-simplex constraint.
    LL_lag = T.sum(LL) + lam * (T.sum(weights) - 1)
    LL_sum = T.sum(LL)
    self.gmm_f = function([Yvec, meansvec, covarsvec, weights, lam], LL_lag)

    # First derivatives of the Lagrangian and of the plain log-likelihood.
    LLg = gradient.jacobian(LL_lag, [Yvec, meansvec, covarsvec, weights, lam])
    LL_sum_g = gradient.jacobian(LL_sum, [Yvec, meansvec, covarsvec, weights])

    # Second derivatives (Hessian blocks) w.r.t. means, covars and weights.
    llhm = gradient.jacobian(LLg[1], [Yvec, meansvec, covarsvec, weights])
    llhc = gradient.jacobian(LLg[2], [Yvec, meansvec, covarsvec, weights])
    llhw = gradient.jacobian(LLg[3], [Yvec, meansvec, covarsvec, weights, lam])

    self.gmm_df = function([Yvec, meansvec, covarsvec, weights], LL_sum_g)
    self.gmm_hm = function([Yvec, meansvec, covarsvec, weights, lam], llhm)
    self.gmm_hc = function([Yvec, meansvec, covarsvec, weights, lam], llhc)
    self.gmm_hw = function([Yvec, meansvec, covarsvec, weights, lam], llhw)

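# A minimal evaluation sketch for the compiled GMM functions above, assuming
# the enclosing object is available as `gmm` with `gm_num` components and
# calc_ll_gmm defined (all shapes and variable names below are hypothetical):
import numpy as np

ndim, n_samples = 2, 100
Y = np.random.randn(n_samples, ndim).ravel()         # flattened samples
means = np.zeros(gmm.gm_num * ndim)                  # flattened means
covars = np.ones(gmm.gm_num * ndim)                  # flattened diag covars
weights = np.full(gmm.gm_num, 1.0 / gmm.gm_num)      # uniform mixture

ll_lag = gmm.gmm_f(Y, means, covars, weights, 0.0)      # Lagrangian value
dY, dm, dc, dw = gmm.gmm_df(Y, means, covars, weights)  # gradients
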
def apply(self, y, y_hat, biases):
    cost = tensor.nnet.categorical_crossentropy(y_hat, y.flatten())
    predicted = y_hat.argmax(axis=1)
    # Here we just count the number of unit biases with nonzero gradient
    jacobians = gradient.jacobian(cost, biases)
    counts = tensor.zeros_like(y)
    for j in jacobians:
        counts += tensor.neq(j, 0).sum(axis=1)
    return counts

def grad(self):
    params = self.model.weights
    netInputs = self.model.input
    # Use the backend's symbolic log; numpy.log cannot operate on a
    # symbolic tensor.
    netOutputs = K.log(self.model.output.flatten())
    gradients = [jacobian(netOutputs, w) for w in self.model.weights]
    return K.function(inputs=[netInputs],
                      outputs=gradients,
                      updates=self.model.state_updates)

def _compute_jacobians(self):
    if self.case_costs is None or self.case_costs.ndim == 0:
        raise ValueError("can't infer jacobians; no case_costs specified")
    elif self.intpic_parameters is None or len(self.intpic_parameters) == 0:
        raise ValueError("can't infer jacobians; no parameters specified")
    logging.info("Taking the intpic jacobians")
    jacobians = gradient.jacobian(self.case_costs, self.intpic_parameters)
    jacobian_map = OrderedDict(equizip(self.intpic_parameters, jacobians))
    logging.info("The intpic jacobian computation graph is built")
    return jacobian_map

def apply(self, y, y_hat, biases, outs):
    cost = tensor.nnet.categorical_crossentropy(y_hat, y.flatten())
    predicted = y_hat.argmax(axis=1)
    # Here we count units that were active (output > 0) yet received
    # zero gradient on their bias.
    jacobians = gradient.jacobian(cost, biases)
    counts = tensor.zeros_like(y)
    for j, o in zip(jacobians, outs):
        # Collapse trailing dimensions to a 2D (batch, unit) activity map.
        while o.ndim > 2:
            o = o.max(axis=o.ndim - 1)
        counts += (tensor.gt(o, 0) * tensor.eq(j, 0)).sum(axis=1)
    return counts

def test_gn_product_rnn():
    raise SkipTest()
    np.random.seed(1010)
    n_timesteps = 3
    n_inpt = 3
    n_output = 2

    rnn = SupervisedRnn(n_inpt, 1, n_output, out_transfer='sigmoid',
                        loss='squared')
    rnn.parameters.data[:] = np.random.normal(0, 1, rnn.parameters.data.shape)

    X = np.random.random((n_timesteps, 1, n_inpt)).astype(theano.config.floatX)
    Z = np.random.random(
        (n_timesteps, 1, n_output)).astype(theano.config.floatX)

    # Calculate the GN product explicitly: G = J^T H J, with J the jacobian
    # of the pre-output w.r.t. the parameters and H the loss Hessian w.r.t.
    # the pre-output.

    # Shortcuts.
    loss = rnn.exprs['loss']
    output_in = rnn.exprs['output_in']
    p = T.vector('some-vector')

    J = jacobian(output_in[:, 0, :].flatten(), rnn.parameters.flat)
    little_J = T.grad(loss, output_in)[:, 0, :]
    little_H = [[T.grad(little_J[i, j], output_in)
                 for j in range(n_output)]
                for i in range(n_timesteps)]

    f_J = rnn.function(['inpt'], J)
    f_H = rnn.function(['inpt', 'target'], little_H)

    J_ = f_J(X)
    H_ = np.array(f_H(X, Z))[:, :, :, 0, :]
    H_.shape = H_.shape[0] * H_.shape[1], H_.shape[2] * H_.shape[3]

    G_expl = np.dot(J_.T, np.dot(H_, J_))

    p = np.random.random(rnn.parameters.data.shape)
    Gp_expl = np.dot(G_expl, p)

    Hp = rnn._gauss_newton_product()
    args = list(rnn.data_arguments)
    f_Hp = rnn.function(['some-vector'] + args, Hp, explicit_pars=True)
    Gp = f_Hp(rnn.parameters.data, p, X, Z)

    assert np.allclose(Gp, Gp_expl)

def test_gn_product_rnn():
    raise SkipTest()
    np.random.seed(1010)
    n_timesteps = 3
    n_inpt = 3
    n_output = 2

    rnn = SupervisedRnn(n_inpt, [1], n_output, out_transfer='sigmoid',
                        loss='squared')
    rnn.parameters.data[:] = np.random.normal(0, 1, rnn.parameters.data.shape)

    X = np.random.random((n_timesteps, 1, n_inpt)).astype(theano.config.floatX)
    Z = np.random.random(
        (n_timesteps, 1, n_output)).astype(theano.config.floatX)
    X, Z = theano_floatx(X, Z)

    # Calculate the GN explicitly.

    # Shortcuts.
    loss = rnn.exprs['loss']
    output_in = rnn.exprs['output_in']
    p = T.vector('some-vector')

    J = jacobian(output_in[:, 0, :].flatten(), rnn.parameters.flat)
    little_J = T.grad(loss, output_in)[:, 0, :]
    little_H = [[T.grad(little_J[i, j], output_in)
                 for j in range(n_output)]
                for i in range(n_timesteps)]

    f_J = rnn.function(['inpt'], J)
    f_H = rnn.function(['inpt', 'target'], little_H)

    J_ = f_J(X)
    H_ = np.array(f_H(X, Z))[:, :, :, 0, :]
    H_.shape = H_.shape[0] * H_.shape[1], H_.shape[2] * H_.shape[3]

    G_expl = np.dot(J_.T, np.dot(H_, J_))

    p = np.random.random(rnn.parameters.data.shape)
    Gp_expl = np.dot(G_expl, p)

    Hp = rnn._gauss_newton_product()
    args = list(rnn.data_arguments)
    f_Hp = rnn.function(['some-vector'] + args, Hp, explicit_pars=True)
    Gp = f_Hp(rnn.parameters.data, p, X, Z)

    assert np.allclose(Gp, Gp_expl)

def __init__(self, state='x', measurement='z',
             motion_transition=None, measurement_transition=None):
    self.N = len(state.split(' '))        # state dimension
    self.M = len(measurement.split(' '))  # measurement dimension

    self.X, self.Z = T.fvectors('X', 'Z')
    self.P, self.Q, self.R = T.fmatrices('P', 'Q', 'R')
    self.F, self.H = T.matrices('F', 'H')
    self.dt = T.scalar('dt')

    # Prediction step.
    self.X_ = T.dot(self.F, self.X)
    self.fX_ = G.jacobian(T.flatten(self.X_), self.X)
    self.P_ = T.dot(T.dot(self.fX_, self.P), T.transpose(self.fX_)) \
        + self.dt * self.Q

    # Update step.
    self.h = T.dot(self.H, self.X_)
    self.y = self.Z - self.h
    self.hX_ = G.jacobian(self.h, self.X_)
    self.matrix_inv = T.nlinalg.MatrixInverse()
    self.S = T.dot(T.dot(self.hX_, self.P_), T.transpose(self.hX_)) + self.R
    self.K = T.dot(T.dot(self.P_, T.transpose(self.hX_)),
                   self.matrix_inv(self.S))
    self.X__ = self.X_ + T.dot(self.K, self.y)
    self.P__ = T.dot(T.identity_like(self.P) - T.dot(self.K, self.hX_),
                     self.P_)

    self.prediction = theano.function(inputs=[self.X, self.P, self.Q,
                                              self.F, self.dt],
                                      outputs=[self.X_, self.P_],
                                      allow_input_downcast=True)
    self.update = theano.function(inputs=[self.X, self.Z, self.P, self.Q,
                                          self.R, self.F, self.H, self.dt],
                                  outputs=[self.X__, self.P__],
                                  allow_input_downcast=True)

    if motion_transition is None:
        self.motion_transition = np.eye(self.N)
    else:
        self.motion_transition = np.array(motion_transition)

    if measurement_transition is None:
        self.measurement_transition = np.eye(self.M)
    else:
        self.measurement_transition = np.array(measurement_transition)

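# A minimal constant-velocity usage sketch, assuming the enclosing class is
# importable as KalmanFilter (the class name and every concrete matrix below
# are hypothetical illustrations; prediction/update come from the snippet):
import numpy as np

kf = KalmanFilter(state='x v', measurement='x')
dt = 0.1
F = np.array([[1.0, dt], [0.0, 1.0]])    # constant-velocity motion model
H = np.array([[1.0, 0.0]])               # observe position only
x = np.zeros(2, dtype=np.float32)        # initial state
P = np.eye(2, dtype=np.float32)          # initial covariance
Q = np.eye(2, dtype=np.float32) * 1e-3   # process noise
R = np.eye(1, dtype=np.float32) * 1e-2   # measurement noise

for z in [0.11, 0.24, 0.37]:             # toy position measurements
    x, P = kf.prediction(x, P, Q, F, dt)
    x, P = kf.update(x, np.float32([z]), P, Q, R, F, H, dt)
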
def contractive_regularizer(op, examples):
    # Squared Frobenius norm of the jacobian of the outputs w.r.t. the inputs.
    jacobian = G.jacobian(op.flatten(), examples)
    regularizer = T.sum(T.abs_(jacobian) ** 2)
    return regularizer

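# A minimal sketch of wiring the regularizer above into a training cost. The
# toy encoder, the stand-in reconstruction cost and the 0.1 weight are all
# hypothetical; only contractive_regularizer comes from the snippet above:
import numpy as np
import theano
import theano.tensor as T

inputs = T.vector('inputs')
W = theano.shared(np.random.randn(4, 3).astype(theano.config.floatX))
hidden = T.nnet.sigmoid(T.dot(inputs, W))   # toy encoder
cost = T.sum((hidden - 0.5) ** 2)           # stand-in reconstruction cost
cost = cost + 0.1 * contractive_regularizer(hidden, inputs)
f = theano.function([inputs], cost)
f(np.ones(4, dtype=theano.config.floatX))
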
def fProp():
    """
    Returns the Theano-style forward propagation and gradient calculation
    function. The generalized function will have the following structure:

        [o, J, dIN, dW, dOUT] = fProp(X, Y, IN, W, OUT, L)

    Inputs:
        X: numpy array containing the examples to be forward propagated.
           Each row is an example, each column is a feature.
        Y: target values; each item has to correspond to an example in X.
           If not training, just pass an array of zeros.
        IN: numpy 2D array with weights that map the input layer to the
            first hidden layer.
        W: numpy 3D array with weights that map within hidden layers.
           W[:,:,i] corresponds to the weights mapping from hidden layer i
           to hidden layer i+1.
        OUT: numpy 2D array that maps from the last hidden layer to the
             output unit.
        L: regularization parameter.

    Outputs:
        o: output for each example in X.
        J: cost calculated using negative log-likelihood.
        dIN, dW, dOUT: partial derivatives of the cost with respect to
                       the weights.

    This function was developed in the Multiscale Cardiovascular Engineering
    Group (MUSE) at University College London by Carlos Ledezma.
    """
    import theano.tensor as T
    from theano import function
    from theano.gradient import jacobian
    from theano import scan

    # Define the forward propagation function.
    # This function will process all examples at the same time.
    L = T.dscalar('L')      # Regularization term
    X = T.dmatrix('X')      # Input cases
    numEx = T.shape(X)[0]
    Y = T.dmatrix('Y')      # Target
    IN = T.dmatrix('IN')    # Weights mapping input layer to first hidden layer
    W = T.dtensor3('W')     # Weights mapping between hidden layers
    OUT = T.dmatrix('OUT')  # Weights mapping last hidden layer to output

    # Start forward prop by mapping inputs to the first hidden layer.
    Xb = T.concatenate([T.ones((numEx, 1)), X], axis=1)  # Add bias term
    a = T.dot(IN, Xb.T)  # Linear combination of inputs
    A = T.nnet.relu(a)   # ReLU

    # Propagate through the network. Each step does:
    #   actb = T.concatenate([T.ones((1, numEx)), act], axis=0)  # Add bias
    #   b = T.dot(W[:, :, i], actb)  # Linear combination of inputs
    #   B = T.nnet.relu(b)           # ReLU
    B, update = scan(
        lambda i, act, W: T.nnet.relu(
            T.dot(W[:, :, i],
                  T.concatenate([T.ones((1, numEx)), act], axis=0))),
        sequences=T.arange(W.shape[2]),
        outputs_info=A,
        non_sequences=W)
    B_final = B[-1]

    # Map to the output layer.
    Bb = T.concatenate([T.ones((1, numEx)), B_final], axis=0)  # Add bias term
    o = T.dot(OUT, Bb)     # Linear combination of inputs
    o = T.nnet.sigmoid(o)  # Sigmoid for classification output

    # Calculate the cost and add regularization.
    J = T.nnet.binary_crossentropy(o, Y).sum() / numEx
    J += L / (2 * numEx) * ((W ** 2).sum() + (OUT ** 2).sum() + (IN ** 2).sum())

    # Calculate jacobians of the cost.
    dIN = jacobian(J, IN)
    dW = jacobian(J, W)
    dOUT = jacobian(J, OUT)

    return function([X, Y, IN, W, OUT, L], [o, J, dIN, dW, dOUT])

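# A minimal call sketch for fProp. All concrete shapes below are hypothetical:
# 2 features, hidden layers of 3 units (one hidden-to-hidden map in W), and a
# single output unit:
import numpy as np

f = fProp()
X = np.random.rand(5, 2)                            # 5 examples, 2 features
Y = np.random.randint(0, 2, (1, 5)).astype(float)   # binary targets (1, 5)
IN = np.random.randn(3, 3) * 0.1                    # (hidden, features + bias)
W = np.random.randn(3, 4, 1) * 0.1                  # (hidden, hidden + bias, 1)
OUT = np.random.randn(1, 4) * 0.1                   # (outputs, hidden + bias)
o, J, dIN, dW, dOUT = f(X, Y, IN, W, OUT, 0.01)
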
import theano
import theano.tensor as T
import theano.gradient as grad
from theano import function

x = T.vector('x')
y = x ** 2

# Jacobian via scan: one gradient row per output element.
J, updates = theano.scan(lambda i, y, x: T.grad(y[i], x),
                         sequences=T.arange(y.shape[0]),
                         non_sequences=[y, x])
f = function([x], J, updates=updates)
print(f([4, 4]))

# The same jacobian via the built-in helper.
f_grad = function([x], grad.jacobian(y, x))
print(f_grad([4, 4]))

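# Both calls compute the same jacobian: d(x_i**2)/dx_j = 2*x_i when i == j
# and 0 otherwise, so for x = [4, 4] either function prints
#   [[ 8.  0.]
#    [ 0.  8.]]
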
def _compute_jacobians(components, parameters):
    logging.info("Taking the component jacobians")
    jacobians = gradient.jacobian(components, parameters)
    jacobian_map = OrderedDict(equizip(parameters, jacobians))
    logging.info("The component jacobian computation graph is built")
    return jacobian_map

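# A minimal sketch of consuming the map returned above (the toy component
# vector and shared weight below are hypothetical):
import numpy as np
import theano
import theano.tensor as T

W = theano.shared(np.ones((2, 2)), name='W')
x = T.dvector('x')
components = T.dot(W, x)                    # toy "components" vector
jmap = _compute_jacobians(components, [W])
f = theano.function([x], jmap[W])           # shape (2, 2, 2): d out_i / d W_jk
f(np.array([1.0, 2.0]))
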
def __init__(self, n_inputs, n_hidden_units, n_hidden_layers, n_outputs,
             hidden_activation='tanh', weight_l2=1e-6):
    self.n_inputs = n_inputs
    self.n_hidden_units = n_hidden_units
    self.n_hidden_layers = n_hidden_layers

    if hidden_activation == 'tanh':
        self.hidden_activation = T.tanh
    elif hidden_activation == 'sigmoid':
        self.hidden_activation = T.nnet.sigmoid
    elif hidden_activation == 'softplus':
        self.hidden_activation = T.nnet.softplus
    elif hidden_activation == 'relu':
        self.hidden_activation = lambda x: T.maximum(0, x)
    else:
        raise NotImplementedError

    self.n_outputs = n_outputs
    self.n_hidden_activation = hidden_activation
    self.n_hidden = [n_hidden_units] * n_hidden_layers
    self.activations = [self.hidden_activation] * self.n_hidden_layers
    # NOTE: The last function goes to the output layer.
    self.activations.append(T.nnet.softmax)
    assert len(self.n_hidden) + 1 == len(self.activations)

    # Model definition.
    x = T.fmatrix('X')
    self.params = []  # Keep model params here.

    # Build the layered neural network.
    y = x
    layers = [self.n_inputs] + self.n_hidden + [self.n_outputs]
    # Iterate over pairs of adjacent layers.
    for i, (n1, n2, act) in enumerate(zip(layers[:-1], layers[1:],
                                          self.activations)):
        w = theano.shared(
            np.asarray(rng.uniform(low=-np.sqrt(6. / (n1 + n2)),
                                   high=np.sqrt(6. / (n1 + n2)),
                                   size=(n1, n2)),
                       dtype=np.float32),
            'W%d' % i, borrow=True)
        b = theano.shared(np.zeros(n2, dtype=np.float32), 'b%d' % (i + 1))
        self.params.append((w, b))
        y = act(T.dot(y, w) + b)

    self.f_y = function([x], y)  # PREDICTION FUNCTION

    # Define the loss function.
    true_y = T.ivector('true_Y')  # The desired output vector.
    loss = -T.log(y[T.arange(y.shape[0]), true_y])  # Negative log-likelihood.
    toss = T.sum(loss)  # SUM negative log-likelihood.

    # Add regularization.
    l2 = 0
    for w, b in self.params:
        l2 += (w ** 2).sum() + (b ** 2).sum()
    loss += weight_l2 * l2

    self.f_toss = function([x, true_y], toss, allow_input_downcast=True)

    # Derive the gradients for the parameters.
    self.f_g_losses = []
    self.f_j_losses = []
    for w, b in self.params:
        g_loss = T.grad(toss, wrt=[w, b])
        f_g_loss = function([x, true_y], g_loss)
        self.f_g_losses.append(f_g_loss)

        j_loss = jacobian(loss, wrt=[w, b])
        # j_loss, updates = theano.scan(
        #     lambda i: T.grad(loss[i], [w, b]),
        #     sequences=T.arange(loss.shape[0]), non_sequences=[])
        f_j_loss = function([x, true_y], j_loss)
        self.f_j_losses.append(f_j_loss)

    self.rprop_init()
    self.adalr_init()