def get_output_for(self, inputs, **kwargs):
    input = inputs[0]
    hid_init = None
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]

    # Input should be provided as (n_batch, n_time_steps, n_features)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, *range(2, input.ndim))
    seq_len, num_batch = input.shape[0], input.shape[1]

    # precompute inputs before scanning
    trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
    input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
    input = helper.get_output(self.input_to_hidden, input, **kwargs)

    # Reshape back to (seq_len, batch_size, trailing dimensions...)
    trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
    input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

    # pass params to step
    non_seqs = helper.get_all_params(self.hidden_to_hidden)
    non_seqs += helper.get_all_params(self.post_concat)

    # Create single recurrent computation step function
    def step(input_n, hid_previous, *args):
        # Compute the hidden-to-hidden activation
        hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous,
                                    **kwargs)
        hid_pre = T.concatenate([hid_pre, input_n], axis=1)
        hid_pre = helper.get_output(self.post_concat, hid_pre, **kwargs)
        if self.grad_clipping:
            hid_pre = theano.gradient.grad_clip(
                hid_pre, -self.grad_clipping, self.grad_clipping)
        return hid_pre

    sequences = input
    step_fun = step

    if not isinstance(self.hid_init, Layer):
        # repeats self.hid_init num_batch times in first dimension
        dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                    [0, self.hid_init.ndim - 1])
        hid_init = T.dot(T.ones((num_batch, 1)),
                         self.hid_init.dimshuffle(dot_dims))

    hid_out = theano.scan(fn=step_fun,
                          sequences=sequences,
                          go_backwards=False,
                          outputs_info=[hid_init],
                          non_sequences=non_seqs,
                          truncate_gradient=-1,
                          strict=True)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

    return hid_out
def get_params(self, **tags):
    # Get all parameters from this layer, the master layer
    params = super(CustomMIRecurrentLayer, self).get_params(**tags)
    # Combine with all parameters from the child layers
    params += helper.get_all_params(self.input_to_hidden, **tags)
    params += helper.get_all_params(self.hidden_to_hidden, **tags)
    return params
def get_params(self, **tags):
    # Get all parameters from this layer, the master layer
    params = super(CustomRecurrentLayer, self).get_params(**tags)
    # Combine with all parameters from the child layers
    params += helper.get_all_params(self.input_to_hidden, **tags)
    params += helper.get_all_params(self.hidden_to_hidden, **tags)
    return params
def get_params(self, **tags):
    # Get all parameters from this layer, the master layer
    params = super(StochsticRecurrentLayer, self).get_params(**tags)
    if self.logvar_p_mlp is not None:
        params += helper.get_all_params(self.logvar_p_mlp, **tags)
    params += helper.get_all_params(self.q_mu_mlp, **tags)
    params += helper.get_all_params(self.q_logvar_mlp, **tags)
    params += helper.get_all_params(self.mu_p_mlp, **tags)
    return params
def get_params(self, **tags):
    # Get all parameters from this layer, the master layer
    params = super(RecurrentUnitaryLayer, self).get_params(**tags)
    # Combine with all parameters from the child layers
    params += helper.get_all_params(self.input_to_hidden, **tags)
    params += helper.get_all_params(self.hidden_to_hidden, **tags)
    if isinstance(self.nonlinearity, Layer):
        params += self.nonlinearity.get_params()
    return params
def get_params(self):
    """
    Get all parameters of this layer.

    :returns:
        - params : list of theano.shared
            List of all parameters
    """
    params = (helper.get_all_params(self.input_to_hidden) +
              helper.get_all_params(self.hidden_to_hidden))
    if self.learn_init:
        return params + self.get_init_params()
    else:
        return params
def get_params(self):
    '''
    Get all parameters of this layer.

    :returns:
        - params : list of theano.shared
            List of all parameters
    '''
    params = (helper.get_all_params(self.input_to_hidden) +
              helper.get_all_params(self.hidden_to_hidden))
    if self.learn_init:
        return params + self.get_init_params()
    else:
        return params
def _compile_train_func(self):
    logger.info("Compiling train cost function...")
    network_input = self.net.symbolic_input()
    network_output = self.net.symbolic_output(deterministic=False)
    target_var = ndim_tensor(name='target', ndim=network_output.ndim)
    mask_var = ndim_tensor(name='mask', ndim=network_output.ndim)
    loss = self.loss_func(network_output, target_var, mask_var)
    all_params = get_all_params(self.net.layers[-1], trainable=True)
    updates = self.updates_func(
        loss, all_params, learning_rate=self._learning_rate)
    train_func = theano.function(
        inputs=[network_input, target_var, mask_var],
        outputs=loss,
        updates=updates,
        on_unused_input='warn',
        allow_input_downcast=True)
    logger.info("Done compiling cost function.")
    return train_func
def __init__(self, *args, **kwargs):
    super(TrainerMixin, self).__init__(*args, **kwargs)
    input_var = tensor.tensor4('inputs')
    target_var = tensor.ivector('targets')
    loss, _ = loss_acc(self.model, input_var, target_var,
                       deterministic=False)

    layers = get_all_layers(self.model)
    decay = regularize_layer_params(layers, l2) * 0.0001
    loss = loss + decay

    params = get_all_params(self.model, trainable=True)
    updates = momentum(loss, params, momentum=0.9,
                       learning_rate=self.learning_rate)
    self.set_training(input_var, target_var, loss, updates)
def get_output_for(self, inputs, mask=None, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable.

    Parameters
    ----------
    inputs : list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable.  When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`.  The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not.  `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
        (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
        of sequence i)``.  When the hidden state of this layer is to be
        pre-filled (i.e. was set to a :class:`Layer` instance), `inputs`
        should have length at least 2, and `inputs[-1]` is the hidden
        state to prefill with.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable.
    """
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask when it is supplied
    hid_init = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]

    # Input should be provided as (n_batch, n_time_steps, n_features)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, *range(2, input.ndim))
    seq_len, num_batch = input.shape[0], input.shape[1]

    if self.precompute_input:
        # Because the input is given for all time steps, we can precompute
        # the inputs to hidden before scanning. First we need to reshape
        # from (seq_len, batch_size, trailing dimensions...) to
        # (seq_len*batch_size, trailing dimensions...)
        # This strange use of a generator in a tuple was because
        # input.shape[2:] was raising a Theano error
        trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
        input = T.reshape(input, (seq_len*num_batch,) + trailing_dims)
        input = helper.get_output(
            self.input_to_hidden, input, **kwargs)

        # Reshape back to (seq_len, batch_size, trailing dimensions...)
        trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
        input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

    # We will always pass the hidden-to-hidden layer params to step
    non_seqs = helper.get_all_params(self.hidden_to_hidden)
    # When we are not precomputing the input, we also need to pass the
    # input-to-hidden parameters to step
    if not self.precompute_input:
        non_seqs += helper.get_all_params(self.input_to_hidden)

    # Create single recurrent computation step function
    def step(input_n, hid_previous, *args):
        # Compute the hidden-to-hidden activation
        hid_pre = helper.get_output(
            self.hidden_to_hidden, hid_previous, **kwargs)

        # If the dot product is precomputed then add it, otherwise
        # calculate the input_to_hidden values and add them
        if self.precompute_input:
            hid_pre += input_n
        else:
            hid_pre += helper.get_output(
                self.input_to_hidden, input_n, **kwargs)

        # Clip gradients
        if self.grad_clipping:
            hid_pre = theano.gradient.grad_clip(
                hid_pre, -self.grad_clipping, self.grad_clipping)

        return self.nonlinearity(hid_pre)

    def step_masked(input_n, mask_n, hid_previous, *args):
        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        hid = step(input_n, hid_previous, *args)
        hid_out = T.switch(mask_n, hid, hid_previous)
        return [hid_out]

    if mask is not None:
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [input, mask]
        step_fun = step_masked
    else:
        sequences = input
        step_fun = step

    if not isinstance(self.hid_init, Layer):
        # The code below simply repeats self.hid_init num_batch times in
        # its first dimension.  Turns out using a dot product and a
        # dimshuffle is faster than T.repeat.
        dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                    [0, self.hid_init.ndim - 1])
        hid_init = T.dot(T.ones((num_batch, 1)),
                         self.hid_init.dimshuffle(dot_dims))

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        hid_out = unroll_scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[hid_init],
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=input_shape[1])[0]
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        hid_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            go_backwards=self.backwards,
            outputs_info=[hid_init],
            non_sequences=non_seqs,
            truncate_gradient=self.gradient_steps,
            strict=True)[0]

    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        hid_out = hid_out[-1]
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

    return hid_out
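# --- Illustrative sketch (not part of the original code) ---------------------
# The dot_dims / T.dot(T.ones((num_batch, 1)), ...) idiom used above tiles
# hid_init across the batch dimension; as the comment notes, it is
# numerically equivalent to T.repeat but compiles to a faster graph.  The
# names and shapes below (num_batch, num_units) are hypothetical.
import numpy as np
import theano
import theano.tensor as T

num_batch, num_units = 4, 3
hid_init = theano.shared(
    np.arange(num_units, dtype=theano.config.floatX).reshape(1, num_units))
dot_dims = list(range(1, hid_init.ndim - 1)) + [0, hid_init.ndim - 1]
tiled = T.dot(T.ones((num_batch, 1)), hid_init.dimshuffle(dot_dims))
repeated = T.repeat(hid_init, num_batch, axis=0)
tiled_val, repeated_val = theano.function([], [tiled, repeated])()
assert np.allclose(tiled_val, repeated_val)  # both are (num_batch, num_units)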
def get_params(self, **tags):
    # Get all parameters from this layer, the master layer
    params = super(ConvTimeStep1DLayer, self).get_params(**tags)
    # Combine with all parameters from the child layers
    params += helper.get_all_params(self.conv1d, **tags)
    return params
def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable.

    Parameters
    ----------
    inputs : list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable.  When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`.  The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not.  `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
        (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
        of sequence i)``.  When the hidden state of this layer is to be
        pre-filled (i.e. was set to a :class:`Layer` instance), `inputs`
        should have length at least 2, and `inputs[-1]` is the hidden
        state to prefill with.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable.
    """
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask when it is supplied
    mask = None
    hid_init = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]

    # Input should be provided as (n_batch, n_time_steps, n_features)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, *range(2, input.ndim))
    seq_len, num_batch = input.shape[0], input.shape[1]

    if self.precompute_input:
        # Because the input is given for all time steps, we can precompute
        # the inputs to hidden before scanning. First we need to reshape
        # from (seq_len, batch_size, trailing dimensions...) to
        # (seq_len*batch_size, trailing dimensions...)
        # This strange use of a generator in a tuple was because
        # input.shape[2:] was raising a Theano error
        trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
        input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
        input = helper.get_output(self.input_to_hidden, input, **kwargs)

        # Reshape back to (seq_len, batch_size, trailing dimensions...)
        trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
        input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

    # We will always pass the hidden-to-hidden layer params to step
    non_seqs = helper.get_all_params(self.hidden_to_hidden)
    non_seqs += self._get_mi_params()
    # When we are not precomputing the input, we also need to pass the
    # input-to-hidden parameters to step
    if not self.precompute_input:
        non_seqs += helper.get_all_params(self.input_to_hidden)

    # Create single recurrent computation step function
    def step(input_n, hid_previous, *args):
        # Compute the hidden-to-hidden activation
        hid_to_hid = helper.get_output(self.hidden_to_hidden, hid_previous,
                                       **kwargs)

        # Compute the input-to-hidden activation
        if self.precompute_input:
            # if the input is precomputed
            in_to_hid = input_n
        else:
            # compute the input
            in_to_hid = helper.get_output(self.input_to_hidden, input_n,
                                          **kwargs)

        # Compute the second order term
        if self.a_g is not None:
            second_order_term = (self.a_g * in_to_hid * hid_to_hid)
            # second_order_term = in_to_hid * hid_to_hid
        else:
            second_order_term = 0

        # Compute the first order hidden-to-hidden term
        if self.b_g_hid_to_hid is not None:
            f_o_hid_to_hid = self.b_g_hid_to_hid * hid_to_hid
        else:
            f_o_hid_to_hid = 0

        # Compute the first order input-to-hidden term
        if self.b_g_in_to_hid is not None:
            f_o_in_to_hid = self.b_g_in_to_hid * in_to_hid
        else:
            # if all else is None, it will output zeros of the right size
            f_o_in_to_hid = T.zeros_like(in_to_hid)

        hid_pre = second_order_term + f_o_in_to_hid + f_o_hid_to_hid

        if self.b is not None:
            hid_pre = hid_pre + self.b

        return self.nonlinearity(hid_pre)

    def step_masked(input_n, mask_n, hid_previous, *args):
        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        hid = step(input_n, hid_previous, *args)
        hid_out = T.switch(mask_n, hid, hid_previous)
        return [hid_out]

    if mask is not None:
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [input, mask]
        step_fun = step_masked
    else:
        sequences = input
        step_fun = step

    if not isinstance(self.hid_init, Layer):
        # The code below simply repeats self.hid_init num_batch times in
        # its first dimension.  Turns out using a dot product and a
        # dimshuffle is faster than T.repeat.
        dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                    [0, self.hid_init.ndim - 1])
        hid_init = T.dot(T.ones((num_batch, 1)),
                         self.hid_init.dimshuffle(dot_dims))

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        hid_out = unroll_scan(fn=step_fun,
                              sequences=sequences,
                              outputs_info=[hid_init],
                              go_backwards=self.backwards,
                              non_sequences=non_seqs,
                              n_steps=input_shape[1])[0]
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        hid_out = theano.scan(fn=step_fun,
                              sequences=sequences,
                              go_backwards=self.backwards,
                              outputs_info=[hid_init],
                              non_sequences=non_seqs,
                              truncate_gradient=self.gradient_steps,
                              strict=True)[0]

    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        hid_out = hid_out[-1]
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[:, ::-1]

    return hid_out
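# --- Illustrative sketch (not part of the original code) ---------------------
# The step function above computes a multiplicative-integration (MI)
# pre-activation: a gated second-order term plus two gated first-order terms
# and a bias, roughly  a_g*(W x)*(U h) + b_in*(W x) + b_hid*(U h) + b.
# The helper below is a plain NumPy restatement with hypothetical shapes.
import numpy as np

def mi_preactivation(in_to_hid, hid_to_hid, a_g, b_g_in, b_g_hid, b):
    """Combine precomputed W*x and U*h terms the way `step` above does."""
    second_order = a_g * in_to_hid * hid_to_hid
    first_order = b_g_in * in_to_hid + b_g_hid * hid_to_hid
    return second_order + first_order + b

# Tiny usage example with a batch of 2 and 3 hidden units (all values made up)
rng = np.random.RandomState(0)
Wx = rng.randn(2, 3)   # stands in for the input-to-hidden activation
Uh = rng.randn(2, 3)   # stands in for the hidden-to-hidden activation
ones, zeros = np.ones(3), np.zeros(3)
assert mi_preactivation(Wx, Uh, ones, ones, ones, zeros).shape == (2, 3)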
y = data['target']
P = compute_joint_probabilities(X, batch_size=batch_size, d=2,
                                perplexity=30, tol=1e-5, verbose=0)

x = Input((None, X.shape[1]))
z = Dense(x, num_units=256, nonlinearity=rectify)
z = Dense(z, num_units=2, nonlinearity=linear)
z_pred = get_output(z)

P_real = T.matrix()
loss = tsne_loss(P_real, z_pred)
params = get_all_params(z, trainable=True)
lr = theano.shared(np.array(0.01, dtype=floatX))
updates = updates.adam(loss, params, learning_rate=lr)
train_fn = theano.function([x.input_var, P_real], loss, updates=updates)
encode = theano.function([x.input_var], z_pred)

X_train = X
Y_train = P
for epoch in range(1000):
    total_loss = 0
    nb = 0
    for xt in iterate_minibatches(X_train, batch_size=batch_size,
                                  shuffle=False):
        yt = Y_train[nb]
        total_loss += train_fn(xt, yt)
        nb += 1  # advance to the matching batch of joint probabilities
def get_output_for(self, inputs, **kwargs):
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask when it is supplied
    mask = None
    hid_init = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]

    # Input should be provided as (n_batch, n_time_steps, n_features)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
    # input = input.dimshuffle(1, 0, *range(2, input.ndim))
    seq_len, num_batch = input.shape[0], input.shape[1]

    # We will always pass the hidden-to-hidden layer params to step
    non_seqs = helper.get_all_params(self.hidden_to_hidden)

    # Create single recurrent computation step function
    def step(input_n, hid_previous, *args):
        # Compute the hidden-to-hidden activation
        hid_pre = helper.get_output(
            self.hidden_to_hidden, hid_previous, **kwargs)
        hid_pre += input_n

        # Clip gradients
        if self.grad_clipping:
            hid_pre = theano.gradient.grad_clip(
                hid_pre, -self.grad_clipping, self.grad_clipping)

        return self.nonlinearity(hid_pre)

    def step_masked(input_n, mask_n, hid_previous, *args):
        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        hid = step(input_n, hid_previous, *args)
        hid_out = T.switch(mask_n, hid, hid_previous)
        return [hid_out]

    if mask is not None:
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [input, mask]
        step_fun = step_masked
    else:
        sequences = input
        step_fun = step

    if not isinstance(self.hid_init, Layer):
        # The code below simply repeats self.hid_init num_batch times in
        # its first dimension.  Turns out using a dot product and a
        # dimshuffle is faster than T.repeat.
        dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                    [0, self.hid_init.ndim - 1])
        hid_init = T.dot(T.ones((num_batch, 1)),
                         self.hid_init.dimshuffle(dot_dims))

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        hid_out = unroll_scan(
            fn=step_fun,
            sequences=sequences,
            outputs_info=[hid_init],
            go_backwards=self.backwards,
            non_sequences=non_seqs,
            n_steps=input_shape[1])[0]
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        hid_out = theano.scan(
            fn=step_fun,
            sequences=sequences,
            go_backwards=self.backwards,
            outputs_info=[hid_init],
            non_sequences=non_seqs,
            truncate_gradient=self.gradient_steps,
            strict=True)[0]

    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        hid_out = hid_out[-1]
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        # hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

        # if scan is backward reverse the output
        if self.backwards:
            hid_out = hid_out[::-1, :]

    return hid_out
def num_trainable_parameters(self):
    return sum(
        [p.get_value().size for p in get_all_params(self.layers[-1])])
def get_params(self, **tags):
    # Get all parameters from this layer, the master layer
    params = super(PermutationalLayer, self).get_params(**tags)
    # Combine with all parameters from the child layers
    params += helper.get_all_params(self.subnet, **tags)
    return params
def main(num_epochs=10, layers=1, load_file=None, batch_size=128,
         seq_len=96, suffix='', test=False, model_name='model'):
    print "Building network ..."
    print theano.config.floatX
    BATCH_SIZE = batch_size
    SEQ_LENGTH = seq_len

    # Recurrent layers expect input of shape
    # (batch size, SEQ_LENGTH, num_features)
    x = T.imatrix('x')
    mask = T.matrix('mask')
    target_values = T.ivector('target_output')

    # We now build a layer for the embeddings.
    U = np.random.randn(vocab_size, char_dims).astype(theano.config.floatX)
    embeddings = theano.shared(U, name='embeddings', borrow=True)
    x_embedded = embeddings[x]

    l_in = lasagne.layers.InputLayer(
        shape=(BATCH_SIZE, SEQ_LENGTH, char_dims), input_var=x_embedded)
    l_mask = lasagne.layers.InputLayer(
        shape=(BATCH_SIZE, SEQ_LENGTH), input_var=mask)

    recurrent_type = lasagne.layers.LSTMLayer
    l_forward_1 = recurrent_type(
        l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask)
    l_backward_1 = recurrent_type(
        l_in, N_HIDDEN, grad_clipping=GRAD_CLIP,
        nonlinearity=lasagne.nonlinearities.tanh, backwards=True,
        mask_input=l_mask)
    if layers == 2:
        l_forward_2 = recurrent_type(
            l_forward_1, N_HIDDEN, grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh, mask_input=l_mask)
        l_backward_2 = recurrent_type(
            l_backward_1, N_HIDDEN, grad_clipping=GRAD_CLIP,
            nonlinearity=lasagne.nonlinearities.tanh, backwards=True,
            mask_input=l_mask)
        l_forward_slice = lasagne.layers.get_output(l_forward_2)[:, -1, :]
        l_backward_slice = lasagne.layers.get_output(l_backward_2)[:, -1, :]
    else:
        l_forward_slice = lasagne.layers.get_output(l_forward_1)[:, -1, :]
        l_backward_slice = lasagne.layers.get_output(l_backward_1)[:, -1, :]

    # Now combine the LSTM layers.
    _Wf = np.random.randn(N_HIDDEN, dim_out).astype(theano.config.floatX)
    _Wb = np.random.randn(N_HIDDEN, dim_out).astype(theano.config.floatX)
    _bias = np.random.randn(dim_out).astype(theano.config.floatX)
    wf = theano.shared(_Wf, name='join forward weights', borrow=True)
    wb = theano.shared(_Wb, name='join backward weights', borrow=True)
    bias = theano.shared(_bias, name='join bias', borrow=True)
    joined = T.dot(l_forward_slice, wf) + T.dot(l_backward_slice, wb) + bias

    tmp = lasagne.layers.InputLayer(shape=(BATCH_SIZE, dim_out))
    l_out = lasagne.layers.DenseLayer(
        tmp, num_units=NUM_TAGS, W=lasagne.init.Normal(),
        nonlinearity=lasagne.nonlinearities.softmax)

    # lasagne.layers.get_output produces a variable for the output of the net
    network_output = l_out.get_output_for(joined)

    # The loss function is calculated as the mean of the (categorical)
    # cross-entropy between the prediction and target.
    cost = T.nnet.categorical_crossentropy(network_output,
                                           target_values).mean()

    # Retrieve all parameters from the network
    if layers == 1:
        all_params = (helper.get_all_params(l_forward_1) +
                      helper.get_all_params(l_backward_1))
    else:
        all_params = (helper.get_all_params(l_forward_2) +
                      helper.get_all_params(l_backward_2))
    all_params += helper.get_all_params(l_out) + [wf, wb, bias, embeddings]
    print len(all_params)

    grads = T.grad(cost, all_params)
    get_grads = theano.function([x, mask, target_values], grads)

    # Compute Adam updates for training
    print("Computing updates ...")
    updates = lasagne.updates.adam(cost, all_params)

    # Theano functions for training and computing cost
    print("Compiling functions ...")
    train = theano.function([x, mask, target_values], cost,
                            updates=updates, allow_input_downcast=True)
    compute_cost = theano.function([x, mask, target_values], cost,
                                   allow_input_downcast=True)

    pred = T.argmax(network_output, axis=1)
    get_preds = theano.function([x, mask], pred, allow_input_downcast=True)
    errors = T.sum(T.neq(pred, target_values))
    count_errors = theano.function([x, mask, target_values], errors,
                                   allow_input_downcast=True)

    def get_data(fname):
        import cPickle
        with open(fname, 'rb') as handle:
            data = cPickle.load(handle)
        xs = [d.astype('int32') for d in data[0]]
        return xs, data[1]

    print 'Loading train'
    train_xs, train_ys = get_data('train%s' % suffix)
    print 'Loading dev'
    dev_xs, dev_ys = get_data('dev%s' % suffix)
    print 'Loading test'
    test_xs, test_ys = get_data('test%s' % suffix)
    print 'Sizes:\tTrain: %d\tDev: %d\tTest: %d\n' % (
        len(train_xs) * BATCH_SIZE, len(dev_xs) * BATCH_SIZE,
        len(test_xs) * BATCH_SIZE)

    def get_accuracy(pXs, pYs):
        total = sum([len(batch) for batch in pXs])
        errors = sum([count_errors(tx, get_mask(tx), ty)
                      for tx, ty in zip(pXs, pYs)])
        return float(total - errors) / total

    def save_preds(pXs, pYs):
        preds = [get_preds(tx, get_mask(tx)) for tx, _ in zip(pXs, pYs)]
        with open('pred.pkl', 'wb') as handle:
            # Fixed: the original called handle.dump(preds, handle), but file
            # objects have no dump method; cPickle does the serialization.
            cPickle.dump(preds, handle)

    if load_file is not None:
        print 'Loading params...'
        with open(load_file, 'rb') as handle:
            params = cPickle.load(handle)
        print len(params)
        for ix, _ in enumerate(zip(params, all_params)):
            all_params[ix].set_value(params[ix].astype('float32'))

    print("Training ...")
    try:
        if test:
            dev_acc = get_accuracy(dev_xs, dev_ys)
            save_preds(dev_xs, dev_ys)
            print dev_acc
            return

        best_acc = 0.0
        for it in xrange(num_epochs):
            data = zip(train_xs, train_ys)
            random.shuffle(data)
            train_xs, train_ys = zip(*data)

            avg_cost = 0
            total = 0.
            for x, y in zip(train_xs, train_ys):
                avg_cost += train(x, get_mask(x), y)
                total += 1.

            train_acc = 0.
            # train_acc = get_accuracy(train_xs, train_ys)
            dev_acc = get_accuracy(dev_xs, dev_ys)
            test_acc = get_accuracy(test_xs, test_ys)
            if dev_acc > best_acc:
                params = [np.asarray(p.eval()) for p in all_params]
                with open('%s_%f.pkl' % (model_name, dev_acc), 'wb') as handle:
                    cPickle.dump(params, handle)
                best_acc = dev_acc

            print("Epoch {} average loss = {}".format(it, avg_cost / total))
            print "Accuracies:\t train: %f\tdev: %f\ttest: %f\n" % (
                train_acc, dev_acc, test_acc)
            print
    except KeyboardInterrupt:
        pass
def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable.

    Parameters
    ----------
    inputs : list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable.  When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`.  The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not.  `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
        (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
        of sequence i)``.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable.
    """
    # Retrieve the layer input
    input = inputs[0]
    # Retrieve the mask when it is supplied
    mask = inputs[1] if len(inputs) > 1 else None

    # Input should be provided as (n_batch, n_time_steps, n_features)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, *range(2, input.ndim))
    seq_len, num_batch = input.shape[0], input.shape[1]

    if self.precompute_input:
        # Because the input is given for all time steps, we can precompute
        # the inputs to hidden before scanning. First we need to reshape
        # from (seq_len, batch_size, trailing dimensions...) to
        # (seq_len*batch_size, trailing dimensions...)
        # This strange use of a generator in a tuple was because
        # input.shape[2:] was raising a Theano error
        trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
        input = T.reshape(input, (seq_len * num_batch, ) + trailing_dims)
        input = helper.get_output(self.input_to_hidden, input, **kwargs)

        # Reshape back to (seq_len, batch_size, trailing dimensions...)
        trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
        input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

    # We will always pass the hidden-to-hidden layer params to step
    non_seqs = helper.get_all_params(self.hidden_to_hidden)
    non_seqs += helper.get_all_params(self.output_to_hidden)
    # When we are not precomputing the input, we also need to pass the
    # input-to-hidden parameters to step
    if not self.precompute_input:
        non_seqs += helper.get_all_params(self.input_to_hidden)

    # Create single recurrent computation step function
    def step(input_n, hid_previous, *args):
        # Compute the hidden-to-hidden activation
        hid_pre = helper.get_output(self.hidden_to_hidden, hid_previous,
                                    **kwargs)
        # out_layers = helper.get_all_layers(self.output_to_hidden)
        # out_layers[1].incoming_layer = self.hidden_to_hidden
        hid_pre += helper.get_output(self.output_to_hidden, hid_previous,
                                     **kwargs)

        # If the dot product is precomputed then add it, otherwise
        # calculate the input_to_hidden values and add them
        if self.precompute_input:
            hid_pre += input_n
        else:
            hid_pre += helper.get_output(self.input_to_hidden, input_n,
                                         **kwargs)

        # Clip gradients
        if self.grad_clipping:
            hid_pre = theano.gradient.grad_clip(
                hid_pre, -self.grad_clipping, self.grad_clipping)

        return self.nonlinearity(hid_pre)

    def step_masked(input_n, mask_n, hid_previous, *args):
        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        hid = step(input_n, hid_previous, *args)
        hid_out = hid * mask_n + hid_previous * (1 - mask_n)
        return [hid_out]

    if mask is not None:
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [input, mask]
        step_fun = step_masked
    else:
        sequences = input
        step_fun = step

    # When hid_init is provided as a TensorVariable, use it as-is
    if isinstance(self.hid_init, T.TensorVariable):
        hid_init = self.hid_init
    else:
        # The code below simply repeats self.hid_init num_batch times in
        # its first dimension.  Turns out using a dot product and a
        # dimshuffle is faster than T.repeat.
        dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                    [0, self.hid_init.ndim - 1])
        hid_init = T.dot(T.ones((num_batch, 1)),
                         self.hid_init.dimshuffle(dot_dims))

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        hid_out = unroll_scan(fn=step_fun,
                              sequences=sequences,
                              outputs_info=[hid_init],
                              go_backwards=self.backwards,
                              non_sequences=non_seqs,
                              n_steps=input_shape[1])[0]
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        hid_out = theano.scan(fn=step_fun,
                              sequences=sequences,
                              go_backwards=self.backwards,
                              outputs_info=[hid_init],
                              non_sequences=non_seqs,
                              truncate_gradient=self.gradient_steps,
                              strict=True)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

    # if scan is backward reverse the output
    if self.backwards:
        hid_out = hid_out[:, ::-1]

    return hid_out
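# --- Illustrative note (not part of the original code) -----------------------
# For a 0/1 mask, the arithmetic blend used in step_masked above,
# hid * mask_n + hid_previous * (1 - mask_n), selects the same values as the
# T.switch(mask_n, hid, hid_previous) form used by the other layers in this
# file.  A quick NumPy check (all numbers are made up):
import numpy as np

mask_n = np.array([[1.], [0.]])                # (n_batch, 1), broadcastable
hid = np.array([[1., 2.], [3., 4.]])           # freshly computed hidden state
hid_previous = np.array([[9., 9.], [8., 8.]])  # state carried over from t-1
blend = hid * mask_n + hid_previous * (1 - mask_n)
switch = np.where(mask_n, hid, hid_previous)
assert np.allclose(blend, switch)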
def get_output_for(self, inputs, **kwargs):
    """
    Compute this layer's output function given a symbolic input variable.

    Parameters
    ----------
    inputs : list of theano.TensorType
        `inputs[0]` should always be the symbolic input variable.  When
        this layer has a mask input (i.e. was instantiated with
        `mask_input != None`, indicating that the lengths of sequences in
        each batch vary), `inputs` should have length 2, where `inputs[1]`
        is the `mask`.  The `mask` should be supplied as a Theano variable
        denoting whether each time step in each sequence in the batch is
        part of the sequence or not.  `mask` should be a matrix of shape
        ``(n_batch, n_time_steps)`` where ``mask[i, j] = 1`` when ``j <=
        (length of sequence i)`` and ``mask[i, j] = 0`` when ``j > (length
        of sequence i)``.  When the hidden state of this layer is to be
        pre-filled (i.e. was set to a :class:`Layer` instance), `inputs`
        should have length at least 2, and `inputs[-1]` is the hidden
        state to prefill with.

    Returns
    -------
    layer_output : theano.TensorType
        Symbolic output variable.
    """
    # Retrieve the layer input
    input_p = inputs[0]
    input_q = inputs[1]
    z_init = inputs[2]
    mu_p_init = inputs[3]
    # Retrieve the mask when it is supplied
    mask = None
    if self.mask_incoming_index > 0:
        mask = inputs[self.mask_incoming_index]

    # Because scan iterates over the first dimension we dimshuffle to
    # (n_time_steps, n_batch, n_features)
    input_p = input_p.dimshuffle(1, 0, 2)
    input_q = input_q.dimshuffle(1, 0, 2)
    seq_len, num_batch, _ = input_p.shape

    # Create single recurrent computation step function
    # input_*_n is the n'th vector of the corresponding input
    def log_sum_exp(a, b):
        return T.log(T.exp(a) + T.exp(b))

    def step(noise_n, input_p_n, input_q_n, z_previous, mu_p_previous,
             logvar_p_previous, mu_q_previous, logvar_q_previous, *args):
        input_p = T.concatenate([input_p_n, z_previous], axis=1)
        mu_p = get_output(self.mu_p_mlp, input_p)

        logvar_p = get_output(self.logvar_p_mlp, input_p)
        logvar_p = T.log(T.exp(logvar_p) + self.cons)

        q_input_n = T.concatenate([input_q_n, z_previous], axis=1)
        mu_q = get_output(self.q_mu_mlp, q_input_n)
        if self.use_mu_residual_q:
            print "Using residuals for mean_q"
            mu_q += mu_p
        logvar_q = get_output(self.q_logvar_mlp, q_input_n)
        # Numerical stability
        logvar_q = T.log(T.exp(logvar_q) + self.cons)

        z_n = mu_q + T.exp(0.5 * logvar_q) * noise_n
        return z_n, mu_p, logvar_p, mu_q, logvar_q

    def step_masked(noise_n, input_p_n, input_q_n, mask_n, z_previous,
                    mu_p_previous, logvar_p_previous, mu_q_previous,
                    logvar_q_previous, *args):
        # Skip over any input with mask 0 by copying the previous
        # hidden state; proceed normally for any input with mask 1.
        z_n, mu_p, logvar_p, mu_q, logvar_q = step(
            noise_n, input_p_n, input_q_n, z_previous, mu_p_previous,
            logvar_p_previous, mu_q_previous, logvar_q_previous, *args)
        z_n = T.switch(mask_n, z_n, z_previous)
        mu_p = T.switch(mask_n, mu_p, mu_p_previous)
        logvar_p = T.switch(mask_n, logvar_p, logvar_p_previous)
        mu_q = T.switch(mask_n, mu_q, mu_q_previous)
        logvar_q = T.switch(mask_n, logvar_q, logvar_q_previous)
        return z_n, mu_p, logvar_p, mu_q, logvar_q

    eps = self._srng.normal(size=(seq_len, num_batch, self.num_units),
                            avg=0.0, std=1.0)
    logvar_init = T.zeros((num_batch, self.num_units))

    if mask is not None:
        # mask is given as (batch_size, seq_len).  Because scan iterates
        # over the first dimension, we dimshuffle to (seq_len, batch_size)
        # and add a broadcastable dimension
        mask = mask.dimshuffle(1, 0, 'x')
        sequences = [eps, input_p, input_q, mask]
        step_fun = step_masked
    else:
        sequences = [eps, input_p, input_q]
        step_fun = step

    # The MLP parameters are always used inside step
    non_seqs = helper.get_all_params(self.logvar_p_mlp)
    non_seqs += helper.get_all_params(self.mu_p_mlp)
    non_seqs += helper.get_all_params(self.q_mu_mlp)
    non_seqs += helper.get_all_params(self.q_logvar_mlp)

    if self.unroll_scan:
        # Retrieve the dimensionality of the incoming layer
        input_shape = self.input_shapes[0]
        # Explicitly unroll the recurrence instead of using scan
        scan_out = unroll_scan(fn=step_fun,
                               sequences=sequences,
                               outputs_info=[z_init, mu_p_init, logvar_init,
                                             mu_p_init, logvar_init],
                               go_backwards=self.backwards,
                               non_sequences=non_seqs,
                               n_steps=input_shape[1])
    else:
        # Scan op iterates over first dimension of input and repeatedly
        # applies the step function
        scan_out = theano.scan(fn=step_fun,
                               sequences=sequences,
                               go_backwards=self.backwards,
                               outputs_info=[z_init, mu_p_init, logvar_init,
                                             mu_p_init, logvar_init],
                               non_sequences=non_seqs,
                               truncate_gradient=self.gradient_steps,
                               strict=True)[0]

    z, mu_p, logvar_p, mu_q, logvar_q = scan_out

    # When it is requested that we only return the final sequence step,
    # we need to slice it out immediately after scan is applied
    if self.only_return_final:
        assert False
    else:
        # dimshuffle back to (n_batch, n_time_steps, n_features)
        z = z.dimshuffle(1, 0, 2)
        mu_p = mu_p.dimshuffle(1, 0, 2)
        logvar_p = logvar_p.dimshuffle(1, 0, 2)
        mu_q = mu_q.dimshuffle(1, 0, 2)
        logvar_q = logvar_q.dimshuffle(1, 0, 2)

        # if scan is backward reverse the output
        if self.backwards:
            z = z[:, ::-1]
            mu_p = mu_p[:, ::-1]
            logvar_p = logvar_p[:, ::-1]
            mu_q = mu_q[:, ::-1]
            logvar_q = logvar_q[:, ::-1]

    return z, mu_p, logvar_p, mu_q, logvar_q
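# --- Illustrative sketch (not part of the original code) ---------------------
# Inside step above, z_n is drawn with the reparameterization trick:
# z = mu_q + exp(0.5 * logvar_q) * eps, where eps ~ N(0, I) is sampled once
# for the whole sequence before scan runs.  A NumPy restatement with
# hypothetical sizes:
import numpy as np

rng = np.random.RandomState(0)
num_batch, num_units = 2, 4
mu_q = rng.randn(num_batch, num_units)
logvar_q = rng.randn(num_batch, num_units)
eps = rng.randn(num_batch, num_units)    # plays the role of noise_n
z = mu_q + np.exp(0.5 * logvar_q) * eps  # differentiable in mu_q and logvar_q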
def get_output_for(self, inputs, **kwargs):
    input = inputs[0]
    hid_init = None
    if self.hid_init_incoming_index > 0:
        hid_init = inputs[self.hid_init_incoming_index]

    # Input should be provided as (n_batch, n_time_steps, n_features)
    # but scan requires the iterable dimension to be first
    # So, we need to dimshuffle to (n_time_steps, n_batch, n_features)
    input = input.dimshuffle(1, 0, *range(2, input.ndim))
    seq_len, num_batch = input.shape[0], input.shape[1]

    # precompute inputs before scanning
    trailing_dims = tuple(input.shape[n] for n in range(2, input.ndim))
    input = T.reshape(input, (seq_len*num_batch,) + trailing_dims)
    input = helper.get_output(
        self.input_to_hidden, input, **kwargs)

    # Reshape back to (seq_len, batch_size, trailing dimensions...)
    trailing_dims = tuple(input.shape[n] for n in range(1, input.ndim))
    input = T.reshape(input, (seq_len, num_batch) + trailing_dims)

    # pass params to step
    non_seqs = helper.get_all_params(self.hidden_to_hidden)
    non_seqs += helper.get_all_params(self.post_concat)

    # Create single recurrent computation step function
    def step(input_n, hid_previous, *args):
        # Compute the hidden-to-hidden activation
        hid_pre = helper.get_output(
            self.hidden_to_hidden, hid_previous, **kwargs)
        hid_pre = T.concatenate([hid_pre, input_n], axis=1)
        hid_pre = helper.get_output(self.post_concat, hid_pre, **kwargs)
        if self.grad_clipping:
            hid_pre = theano.gradient.grad_clip(
                hid_pre, -self.grad_clipping, self.grad_clipping)
        return hid_pre

    sequences = input
    step_fun = step

    if not isinstance(self.hid_init, Layer):
        # repeats self.hid_init num_batch times in first dimension
        dot_dims = (list(range(1, self.hid_init.ndim - 1)) +
                    [0, self.hid_init.ndim - 1])
        hid_init = T.dot(T.ones((num_batch, 1)),
                         self.hid_init.dimshuffle(dot_dims))

    hid_out = theano.scan(
        fn=step_fun,
        sequences=sequences,
        go_backwards=False,
        outputs_info=[hid_init],
        non_sequences=non_seqs,
        truncate_gradient=-1,
        strict=True)[0]

    # dimshuffle back to (n_batch, n_time_steps, n_features)
    hid_out = hid_out.dimshuffle(1, 0, *range(2, hid_out.ndim))

    return hid_out