class FeedbackRNN(BaseRecurrent): def __init__(self, dim, **kwargs): super(FeedbackRNN, self).__init__(**kwargs) self.dim = dim self.first_recurrent_layer = SimpleRecurrent( dim=self.dim, activation=Identity(), name='first_recurrent_layer', weights_init=initialization.Identity()) self.second_recurrent_layer = SimpleRecurrent( dim=self.dim, activation=Identity(), name='second_recurrent_layer', weights_init=initialization.Identity()) self.children = [ self.first_recurrent_layer, self.second_recurrent_layer ] @recurrent(sequences=['inputs'], contexts=[], states=['first_states', 'second_states'], outputs=['first_states', 'second_states']) def apply(self, inputs, first_states=None, second_states=None): first_h = self.first_recurrent_layer.apply(inputs=inputs, states=first_states + second_states, iterate=False) second_h = self.second_recurrent_layer.apply(inputs=first_h, states=second_states, iterate=False) return first_h, second_h def get_dim(self, name): return (self.dim if name in ('inputs', 'first_states', 'second_states') else super(FeedbackRNN, self).get_dim(name))
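A minimal usage sketch for the FeedbackRNN brick above, mirroring the Blocks recurrent tutorial; the imports are assumptions, since the snippet does not show them.

import numpy as np
import theano
from theano import tensor
from blocks import initialization
from blocks.bricks import Identity
from blocks.bricks.recurrent import BaseRecurrent, SimpleRecurrent, recurrent

x = tensor.tensor3('x')
feedback = FeedbackRNN(dim=3)
feedback.initialize()
first_h, second_h = feedback.apply(inputs=x)
f = theano.function([x], [first_h, second_h])
for states in f(np.ones((3, 1, 3), dtype=theano.config.floatX)):
    print(states)  # one (time, batch, dim) array per layer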
def example(): """ Simple recurrent example. Taken from: https://github.com/mdda/pycon.sg-2015_deep-learning/blob/master/ipynb/blocks-recurrent-docs.ipynb """ x = tensor.tensor3('x') rnn = SimpleRecurrent(dim=3, activation=Identity(), weights_init=initialization.Identity()) rnn.initialize() h = rnn.apply(x) f = theano.function([x], h) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) doubler = Linear( input_dim=3, output_dim=3, weights_init=initialization.Identity(2), biases_init=initialization.Constant(0)) doubler.initialize() h_doubler = rnn.apply(doubler.apply(x)) f = theano.function([x], h_doubler) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX))) # Initial state h0 = tensor.matrix('h0') h = rnn.apply(inputs=x, states=h0) f = theano.function([x, h0], h) print(f(np.ones((3, 1, 3), dtype=theano.config.floatX), np.ones((1, 3), dtype=theano.config.floatX)))
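Because the example uses an Identity activation and identity recurrent weights, the three compiled functions above have a simple closed form worth keeping in mind (a restatement of the code, not new behaviour): the plain RNN accumulates its inputs, and the doubler scales them first.

h_t = h_{t-1} + x_t \quad\text{(plain)}, \qquad h_t = h_{t-1} + 2\,x_t \quad\text{(with the doubler)}, \qquad h_0 \text{ given explicitly in the last call.}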
def rnn_layer(in_size, dim, x, h, n, first_layer=False): if connect_h_to_h == 'all-previous': if first_layer: rnn_input = x linear = Linear(input_dim=in_size, output_dim=dim, name='linear' + str(n) + '-' + str(task)) elif connect_x_to_h: rnn_input = T.concatenate([x] + [hidden for hidden in h], axis=2) linear = Linear(input_dim=in_size + dim * n, output_dim=dim, name='linear' + str(n) + '-' + str(task)) else: rnn_input = T.concatenate([hidden for hidden in h], axis=2) linear = Linear(input_dim=dim * n, output_dim=dim, name='linear' + str(n) + '-' + str(task)) elif connect_h_to_h == 'two-previous': if first_layer: rnn_input = x linear = Linear(input_dim=in_size, output_dim=dim, name='linear' + str(n) + '-' + str(task)) elif connect_x_to_h: rnn_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2) linear = Linear(input_dim=in_size + dim * 2 if n > 1 else in_size + dim, output_dim=dim, name='linear' + str(n) + '-' + str(task)) else: rnn_input = T.concatenate(h[max(0, n - 2):n], axis=2) linear = Linear(input_dim=dim * 2 if n > 1 else dim, output_dim=dim, name='linear' + str(n) + '-' + str(task)) elif connect_h_to_h == 'one-previous': if first_layer: rnn_input = x linear = Linear(input_dim=in_size, output_dim=dim, name='linear' + str(n) + '-' + str(task)) elif connect_x_to_h: rnn_input = T.concatenate([x] + [h[n - 1]], axis=2) linear = Linear(input_dim=in_size + dim, output_dim=dim, name='linear' + str(n) + '-' + str(task)) else: rnn_input = h[n] linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n) + '-' + str(task)) rnn = SimpleRecurrent(dim=dim, activation=Tanh(), name=layer_models[n] + str(n) + '-' + str(task)) initialize([linear, rnn]) if layer_models[n] == 'rnn': return rnn.apply(linear.apply(rnn_input)) elif layer_models[n] == 'mt_rnn': return rnn.apply(linear.apply(rnn_input), time_scale=layer_resolutions[n], time_offset=layer_execution_time_offset[n])
def test_similar_scans(): x = tensor.tensor3('x') r1 = SimpleRecurrent(activation=Tanh(), dim=10) y1 = r1.apply(x) r2 = SimpleRecurrent(activation=Tanh(), dim=10) y2 = r2.apply(x) cg = ComputationGraph([y1, y2]) assert len(cg.scans) == 2
class RNNwMini(BaseRecurrent): def __init__(self, dim, mini_dim, summary_dim, **kwargs): super(RNNwMini, self).__init__(**kwargs) self.dim = dim self.mini_dim = mini_dim self.summary_dim = summary_dim self.recurrent_layer = SimpleRecurrent( dim=self.summary_dim, activation=Rectifier(), name='recurrent_layer', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.mini_recurrent_layer = SimpleRecurrent( dim=self.mini_dim, activation=Rectifier(), name='mini_recurrent_layer', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.mini_to_main = Linear(self.dim + self.mini_dim, self.summary_dim, name='mini_to_main', weights_init=IsotropicGaussian(), biases_init=Constant(0.0)) self.children = [ self.recurrent_layer, self.mini_recurrent_layer, self.mini_to_main ] @recurrent(sequences=['x', 'xmini'], contexts=[], states=['states'], outputs=['states']) def apply(self, x, xmini, states=None): mini_h_all = self.mini_recurrent_layer.apply(inputs=xmini, states=None, iterate=True) #grab last hidden state mini_h = mini_h_all[-1] combInput = T.concatenate([x, mini_h], axis=1) combTransform = self.mini_to_main.apply(combInput) h = self.recurrent_layer.apply(inputs=combTransform, states=states, iterate=False) return h def get_dim(self, name): dim = 1 if name == 'x': dim = self.dim elif name == 'states': dim = self.summary_dim else: dim = super(RNNwMini, self).get_dim(name) return dim
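A compile-and-run sketch for RNNwMini (hedged: the shapes and dimension values below are illustrative assumptions; x carries one feature vector per step, xmini a whole sub-sequence per step).

import numpy as np
import theano
from theano import tensor

x = tensor.tensor3('x')          # (seq_len, batch, dim)
xmini = tensor.tensor4('xmini')  # (seq_len, mini_seq_len, batch, mini_dim)
rnn = RNNwMini(dim=3, mini_dim=2, summary_dim=5)
rnn.initialize()
h = rnn.apply(x=x, xmini=xmini)
f = theano.function([x, xmini], h)
out = f(np.ones((4, 2, 3), dtype=theano.config.floatX),
        np.ones((4, 6, 2, 2), dtype=theano.config.floatX))
print(out.shape)  # expected: (4, 2, 5) = (seq_len, batch, summary_dim)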
def rnn_layer(dim, h, n, x_mask, first, **kwargs): linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n)) rnn = SimpleRecurrent(dim=dim, activation=Rectifier(), name='rnn' + str(n)) initialize([linear, rnn]) applyLin = linear.apply(h) if first: rnnApply = rnn.apply(applyLin, mask=x_mask, **kwargs) else: rnnApply = rnn.apply(applyLin, **kwargs) return rnnApply
class TestSimpleRecurrent(unittest.TestCase): def setUp(self): self.simple = SimpleRecurrent(dim=3, weights_init=Constant(2), activation=Tanh()) self.simple.initialize() def test_one_step(self): h0 = tensor.matrix('h0') x = tensor.matrix('x') mask = tensor.vector('mask') h1 = self.simple.apply(x, h0, mask=mask, iterate=False) next_h = theano.function(inputs=[h0, x, mask], outputs=[h1]) h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=theano.config.floatX) x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype=theano.config.floatX) mask_val = numpy.array([1, 0]).astype(theano.config.floatX) h1_val = numpy.tanh(h0_val.dot(2 * numpy.ones((3, 3))) + x_val) h1_val = mask_val[:, None] * h1_val + (1 - mask_val[:, None]) * h0_val assert_allclose(h1_val, next_h(h0_val, x_val, mask_val)[0]) def test_many_steps(self): x = tensor.tensor3('x') mask = tensor.matrix('mask') h = self.simple.apply(x, mask=mask, iterate=True) calc_h = theano.function(inputs=[x, mask], outputs=[h]) x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))), dtype=theano.config.floatX) x_val = numpy.ones( (24, 4, 3), dtype=theano.config.floatX) * x_val[..., None] mask_val = numpy.ones((24, 4), dtype=theano.config.floatX) mask_val[12:24, 3] = 0 h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX) for i in range(1, 25): h_val[i] = numpy.tanh(h_val[i - 1].dot(2 * numpy.ones((3, 3))) + x_val[i - 1]) h_val[i] = (mask_val[i - 1, :, None] * h_val[i] + (1 - mask_val[i - 1, :, None]) * h_val[i - 1]) h_val = h_val[1:] assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04) # Also test that initial state is a parameter initial_state, = VariableFilter(roles=[INITIAL_STATE])( ComputationGraph(h)) assert is_shared_variable(initial_state) assert initial_state.name == 'initial_state'
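For reference, the update these tests re-derive by hand is the SimpleRecurrent step with masking, written out as math (a restatement of the code above):

h_t = \tanh\left(h_{t-1} W + x_t\right), \qquad h_t \leftarrow m_t \odot h_t + (1 - m_t) \odot h_{t-1}

where W is the 3x3 state-to-state matrix (here all entries equal 2) and m_t is the per-example mask.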
class MyRnn(BaseRecurrent): # Extend the base recurrent class to create one of your own def __init__(self, dim, **kwargs): super(MyRnn, self).__init__(**kwargs) self.dim = dim self.layer1 = SimpleRecurrent(dim=self.dim, activation=Identity(), name='recurrent layer 1', weights_init=initialization.Identity()) self.layer2 = SimpleRecurrent(dim=self.dim, activation=Identity(), name='recurrent layer 2', weights_init=initialization.Identity()) self.children = [self.layer1, self.layer2] @recurrent(sequences=['inputs'], contexts=[], states=['first_states', 'second_states'], outputs=['first_states', 'second_states']) def apply(self, inputs, first_states=None, second_states=None): first_h = self.layer1.apply(inputs=inputs, states=first_states, iterate=False) second_h = self.layer2.apply(inputs=first_h, states=second_states, iterate=False) return first_h, second_h def get_dim(self, name): return (self.dim if name in ('inputs', 'first_states', 'second_states') else super(MyRnn, self).get_dim(name))
class TestBidirectional(unittest.TestCase): def setUp(self): self.bidir = Bidirectional(weights_init=Orthogonal(), prototype=SimpleRecurrent( dim=3, activation=Tanh())) self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(), activation=Tanh(), seed=1) self.bidir.allocate() self.simple.initialize() self.bidir.children[0].params[0].set_value( self.simple.params[0].get_value()) self.bidir.children[1].params[0].set_value( self.simple.params[0].get_value()) self.x_val = 0.1 * numpy.asarray( list(itertools.permutations(range(4))), dtype=floatX) self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) * self.x_val[..., None]) self.mask_val = numpy.ones((24, 4), dtype=floatX) self.mask_val[12:24, 3] = 0 def test(self): x = tensor.tensor3('x') mask = tensor.matrix('mask') calc_bidir = theano.function([x, mask], [self.bidir.apply(x, mask=mask)]) calc_simple = theano.function([x, mask], [self.simple.apply(x, mask=mask)]) h_bidir = calc_bidir(self.x_val, self.mask_val)[0] h_simple = calc_simple(self.x_val, self.mask_val)[0] h_simple_rev = calc_simple(self.x_val[::-1], self.mask_val[::-1])[0] assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04) assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)
def rnn_layer(in_dim, h, h_dim, n): linear = Linear(input_dim=in_dim, output_dim=h_dim, name='linear' + str(n) + h.name) rnn = SimpleRecurrent(dim=h_dim, activation=Tanh(), name='rnn' + str(n)) initialize([linear, rnn]) return rnn.apply(linear.apply(h))
class CompositionalLayerToyWithTables(Initializable): def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size, subword_RNN_hidden_state_size, **kwargs): super(CompositionalLayerToyWithTables, self).__init__(**kwargs) self.batch_size = batch_size self.num_subwords = num_subwords # number of subwords which make up a word self.num_words = num_words # number of words in the sentence self.subword_embedding_size = subword_embedding_size self.input_vocab_size = input_vocab_size self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size # create the look up table self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup') self.lookup.weights_init = Uniform(width=0.08) self.lookup.biases_init = Constant(0) # has one RNN which reads the subwords into a word embedding self.compositional_subword_to_word_RNN = SimpleRecurrent( dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN', weights_init=Identity_init()) self.children = [self.lookup, self.compositional_subword_to_word_RNN] ''' subword_id_input_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint16 or equivalent subword_id_input_mask_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint8 or equivalent and has binary values of 1 when there is data and zero otherwise. The look up table will return a 4d tensor with shape = (num_words, num_subwords, batch_size, embedding size) The RNN will eat up the subwords dimension, resulting in a 3d tensor of shape = (num_words, batch_size, RNN_hidden_value_size), which is returned as 'word_embeddings' Also returned is a 2d tensor of shape = (num_words, batch_zize), which is the remaining mask indicated the length of the sentence for each sentence in the batch. i.e., 1 when there is a word, 0 otherwise. ''' @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_mask']) def apply(self, subword_id_input_, subword_id_input_mask_): ##shape = (num_words, num_subwords, batch_size, embedding size) subword_embeddings = self.lookup.apply(subword_id_input_) result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN.apply(subword_embeddings, mask=subword_id_input_mask_), sequences= [subword_embeddings, subword_id_input_mask_]) word_embeddings = result.dimshuffle(1,0,2,3) #put the states as the last dimension #remove this line to see the RNN states word_embeddings = word_embeddings[-1] #take only the last state, since we dont need the others #remove subword dim from mask #if subword is empty then word is emptry the word is emptry, if not then the word is used word_embeddings_mask = subword_id_input_mask_.max(axis=1) return word_embeddings, word_embeddings_mask
def rnn_layer(in_dim, h, h_dim, n, pref=""): linear = Linear(input_dim=in_dim, output_dim=h_dim, name='linear' + str(n) + pref) rnn = SimpleRecurrent(dim=h_dim, activation=Tanh(), name='rnn' + str(n) + pref) initialize([linear, rnn]) return rnn.apply(linear.apply(h))
def test_saved_inner_graph(): """Make sure that the original inner graph is saved.""" x = tensor.tensor3() recurrent = SimpleRecurrent(dim=3, activation=Tanh()) y = recurrent.apply(x) application_call = get_application_call(y) assert application_call.inner_inputs assert application_call.inner_outputs cg = ComputationGraph(application_call.inner_outputs) # Check that the inner scan graph is annotated # with `recurrent.apply` assert len(VariableFilter(application=recurrent.apply)(cg)) == 3 # Check that the inner graph is equivalent to the one # produced by a stand-alone of `recurrent.apply` assert is_same_graph(application_call.inner_outputs[0], recurrent.apply(*application_call.inner_inputs, iterate=False))
class LanguageModelToy(Initializable): """ This takes the word embeddings from CompositionalLayerToyWithTables and creates sentence embeddings Input is a 3d tensor with the dimensions of (num_words, num_subwords, batch_size) and a 3d tensor a mask of size (num_words, num_subwords, batch_size) All hidden state sizes are the same as the subword embedding size This returns a 3d tensor with dimenstions of (num_words = num RNN states, batch_size, sentence embedding size) """ def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size, subword_RNN_hidden_state_size, LM_RNN_hidden_state_size, **kwargs): super(LanguageModelToy, self).__init__(**kwargs) self.batch_size = batch_size self.num_subwords = num_subwords # number of subwords which make up a word self.num_words = num_words # number of words in the sentence self.subword_embedding_size = subword_embedding_size self.input_vocab_size = input_vocab_size self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size self.LM_RNN_hidden_state_size = LM_RNN_hidden_state_size self.compositional_layer = CompositionalLayerToyWithTables(self.batch_size, self.num_subwords, self.num_words, self.subword_embedding_size, self.input_vocab_size, self.subword_RNN_hidden_state_size, name='compositional_layer') # has one RNN which reads the word embeddings into a sentence embedding self.language_model_RNN = SimpleRecurrent( dim=self.LM_RNN_hidden_state_size, activation=Identity(), name='language_model_RNN', weights_init=Identity_init()) self.children = [self.compositional_layer, self.language_model_RNN] @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['sentence_embeddings', 'sentence_embeddings_mask']) def apply(self, subword_id_input_, subword_id_input_mask_): """ subword_id_input_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint16 or equivalent subword_id_input_mask_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size). It is expected as a dtype=uint8 or equivalent and has binary values of 1 when there is data and zero otherwise. Returned is a 3d tensor of size (num_words = num RNN states, batch_size, sentence embedding size) Also returned is a 1d tensor of size (batch_size) describing if the sentence is valid of empty in the batch """ word_embeddings, word_embeddings_mask = self.compositional_layer.apply(subword_id_input_, subword_id_input_mask_) sentence_embeddings = self.language_model_RNN.apply(word_embeddings, mask=word_embeddings_mask) sentence_embeddings_mask = word_embeddings_mask.max(axis=0).T return sentence_embeddings, sentence_embeddings_mask
def example5(): """Bidirectional + SimpleRecurrent. Adapted from a unit test in Blocks. """ bidir = Bidirectional(weights_init=Orthogonal(), prototype=SimpleRecurrent( dim=3, activation=Tanh())) simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(), activation=Tanh(), seed=1) bidir.allocate() simple.initialize() bidir.children[0].parameters[0].set_value( simple.parameters[0].get_value()) bidir.children[1].parameters[0].set_value( simple.parameters[0].get_value()) # Initialize Theano variables and functions x = tensor.tensor3('x') mask = tensor.matrix('mask') calc_bidir = theano.function([x, mask], [bidir.apply(x, mask=mask)]) calc_simple = theano.function([x, mask], [simple.apply(x, mask=mask)]) # Testing time x_val = 0.1 * np.asarray( list(itertools.permutations(range(4))), dtype=theano.config.floatX) x_val = (np.ones((24, 4, 3), dtype=theano.config.floatX) * x_val[..., None]) mask_val = np.ones((24, 4), dtype=theano.config.floatX) mask_val[12:24, 3] = 0 h_bidir = calc_bidir(x_val, mask_val)[0] h_simple = calc_simple(x_val, mask_val)[0] h_simple_rev = calc_simple(x_val[::-1], mask_val[::-1])[0] print(h_bidir) print(h_simple) print(h_simple_rev)
class TextRNN(object): def __init__(self, dim_in, dim_hidden, dim_out, **kwargs): self.dim_in = dim_in self.dim_hidden = dim_hidden self.dim_out = dim_out self.input_layer = Linear(input_dim=self.dim_in, output_dim=self.dim_hidden, weights_init=initialization.IsotropicGaussian(), biases_init=initialization.Constant(0)) self.input_layer.initialize() sparse_init = initialization.Sparse(num_init=15, weights_init=initialization.IsotropicGaussian()) self.recurrent_layer = SimpleRecurrent( dim=self.dim_hidden, activation=Tanh(), name="first_recurrent_layer", weights_init=sparse_init, biases_init=initialization.Constant(0.01)) ''' self.recurrent_layer = LSTM(dim=self.dim_hidden, activation=Tanh(), weights_init=initialization.IsotropicGaussian(std=0.001), biases_init=initialization.Constant(0.01)) ''' self.recurrent_layer.initialize() self.output_layer = Linear(input_dim=self.dim_hidden, output_dim=self.dim_out, weights_init=initialization.Uniform(width=0.01), biases_init=initialization.Constant(0.01)) self.output_layer.initialize() self.children = [self.input_layer, self.recurrent_layer, self.output_layer] ''' @recurrent(sequences=['inputs'], states=['states'], contexts=[], outputs=['states', 'output']) ''' def run(self, inputs): output = self.output_layer.apply( self.recurrent_layer.apply(self.input_layer.apply(inputs)) ) return output
# Computational Graph input = T.tensor3('input') mask = T.fmatrix('mask') target = T.tensor3('target') linear1 = Linear(name='linear1', input_dim=300, output_dim=128) recurrent = SimpleRecurrent(name='recurrent', activation=Tanh(), dim=128) linear2 = Linear(name='linear2', input_dim=128, output_dim=9) softmax = Softmax() bricks = [linear1, recurrent, linear2] for brick in bricks: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0) brick.initialize() linear1_output = linear1.apply(input) recurrent_output = recurrent.apply(linear1_output, mask=mask) linear2_output = linear2.apply(recurrent_output) shape = linear2_output.shape # 100 x 29 x 9 output = softmax.apply(linear2_output.reshape( (-1, 9))).reshape(shape) # flatten all dimensions except the last one, which is 9 # Cost and Functions cost = T.nnet.categorical_crossentropy(output, target) # 100 x 29 cost = cost * mask cost = cost.mean() params = Model(cost).parameters updates = sgd(cost, params) f_train = theano.function(inputs=[input, mask, target], outputs=cost,
activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) rnn.initialize() linear_output = Linear( name='linear_output', input_dim=hidden_layer_dim, output_dim=charset_size, weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear_output.initialize() softmax = NDimensionalSoftmax(name='ndim_softmax') activation_input = lookup_input.apply(x) hidden = rnn.apply(linear_input.apply(activation_input)) activation_output = linear_output.apply(hidden) y_est = softmax.apply(activation_output, extra_ndim=1) cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean() from blocks.graph import ComputationGraph from blocks.algorithms import GradientDescent, Adam cg = ComputationGraph([cost]) step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)] algorithm = GradientDescent(
def rnn_layer(dim, h, n): linear = Linear(input_dim=dim, output_dim=dim, name='linear' + str(n)) rnn = SimpleRecurrent(dim=dim, activation=Tanh(), name='rnn' + str(n)) initialize([linear, rnn]) return rnn.apply(linear.apply(h))
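A hypothetical way to stack the helper above into a two-layer RNN (assumes `initialize`, used inside rnn_layer, is a project helper that sets weights_init/biases_init and calls .initialize() on each brick; the dimensions are illustrative).

import theano
from theano import tensor

x = tensor.tensor3('x')   # (seq_len, batch, dim); the input must already have `dim` features
h1 = rnn_layer(dim=16, h=x, n=0)
h2 = rnn_layer(dim=16, h=h1, n=1)
f = theano.function([x], h2)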
class BaselineCompositionalLayerToyBidirectional(Initializable): def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size, subword_RNN_hidden_state_size, add_one = True, **kwargs): super(BaselineCompositionalLayerToyBidirectional, self).__init__(**kwargs) self.batch_size = batch_size self.num_subwords = num_subwords # number of subwords which make up a word self.num_words = num_words # number of words in the sentence self.subword_embedding_size = subword_embedding_size self.input_vocab_size = input_vocab_size self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size self.add_one = add_one # adds 1 to the backwards embeddings # create the look up table self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup') self.lookup.weights_init = Uniform(width=0.08) self.lookup.biases_init = Constant(0) # has one RNN which reads the subwords into a word embedding self.compositional_subword_to_word_RNN_forward = SimpleRecurrent( dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_forward', weights_init=Identity_init()) self.compositional_subword_to_word_RNN_backward = SimpleRecurrent( dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_backward', weights_init=Identity_init()) self.children = [self.lookup, self.compositional_subword_to_word_RNN_forward, self.compositional_subword_to_word_RNN_backward] ''' The RNN will eat up the subwords dimension, resulting in a 3d tensor of shape = (num_subwords, batch_size, RNN_hidden_value_size * 2), which is returned as 'word_embeddings' NOTE: That it is the shape of num_subwords not num_words The backwords embbedding elements are +1, to show them as different from the forward ones. ''' @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_with_states', 'word_embeddings_mask']) def apply(self, subword_id_input_, subword_id_input_mask_): ##shape = (num_words, num_subwords, batch_size, embedding size) subword_embeddings = self.lookup.apply(subword_id_input_) #forward sequence forward_result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_forward.apply(subword_embeddings, mask=subword_id_input_mask_), sequences= [subword_embeddings, subword_id_input_mask_]) forward_word_embeddings_with_states = forward_result.dimshuffle(1,0,2,3) # keep to check for values as output #DO NOT DIMSHUFFLE AS YOU WANT IT TO BE state 1 then state 2 then state 3 etc. s = forward_result.shape forward_word_embeddings = T.reshape(forward_result, (s[0]*s[1], s[2], s[3])) #backward sequence backward_result, updates = theano.scan( #loop over each word and have the rnn eat up the subwords fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_backward.apply(subword_embeddings, mask=subword_id_input_mask_), sequences= [subword_embeddings[:,::-1,:], subword_id_input_mask_[:,::-1,:]]) #NOTE! 
# added + 1 to backward embeddings to show them as different from forward embeddings if self.add_one: backward_result = backward_result + 1 backward_word_embeddings_with_states = backward_result.dimshuffle(1,0,2,3) # keep to check for values as output backward_word_embeddings = T.reshape(backward_result, (s[0]*s[1], s[2], s[3])) word_embeddings_with_states = T.concatenate([forward_word_embeddings_with_states, backward_word_embeddings_with_states], axis=3) word_embeddings_with_states = word_embeddings_with_states.dimshuffle(2,0,1,3)[-1] word_embeddings = T.concatenate([forward_word_embeddings, backward_word_embeddings], axis=2) # remove subword dim from mask # if the subword is empty then the word is empty; if not, the word is used word_embeddings_mask_with_states = subword_id_input_mask_.max(axis=1) return word_embeddings, word_embeddings_with_states, word_embeddings_mask_with_states
dim=hidden_layer_dim, activation=Tanh(), weights_init=initialization.Uniform(width=0.01)) rnn.initialize() linear_output = Linear(name='linear_output', input_dim=hidden_layer_dim, output_dim=train_dataset.durations_vocab_size(), weights_init=initialization.Uniform(width=0.01), biases_init=Constant(0)) linear_output.initialize() softmax = NDimensionalSoftmax(name='ndim_softmax') activation_input = lookup_input.apply(x) hidden = rnn.apply(linear_input.apply(activation_input)) activation_output = linear_output.apply(hidden) y_est = softmax.apply(activation_output, extra_ndim=1) cost = softmax.categorical_cross_entropy(y, activation_output, extra_ndim=1).mean() from blocks.graph import ComputationGraph from blocks.algorithms import GradientDescent, Adam cg = ComputationGraph([cost]) step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95), StepClipping(1.0)] algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
def __init__(self, rnn_dims, num_actions, data_X_np=None, data_y_np=None, width=32, height=32): ############################################################### # # Network and data setup # ############################################################## RNN_DIMS = 100 NUM_ACTIONS = num_actions tensor5 = T.TensorType('float32', [False, True, True, True, True]) self.x = T.tensor4('features') self.reward = T.tensor3('targets', dtype='float32') self.state = T.matrix('states', dtype='float32') self.hidden_states = [] # holds hidden states in np array form #data_X & data_Y supplied in init function now... if data_X_np is None or data_y_np is None: print 'you did not supply data at init' data_X_np = np.float32(np.random.normal(size=(1280, 1,1, width, height))) data_y_np = np.float32(np.random.normal(size=(1280, 1,1,1))) #data_states_np = np.float32(np.ones((1280, 1, 100))) state_shape = (data_X_np.shape[0],rnn_dims) self.data_states_np = np.float32(np.zeros(state_shape)) self.datastream = IterableDataset(dict(features=data_X_np, targets=data_y_np, states=self.data_states_np)).get_example_stream() self.datastream_test = IterableDataset(dict(features=data_X_np, targets=data_y_np, states=self.data_states_np)).get_example_stream() data_X = self.datastream # 2 conv inputs # we want to take our sequence of input images and convert them to convolutional # representations conv_layers = [ConvolutionalLayer(Rectifier().apply, (3, 3), 16, (2, 2), name='l1'), ConvolutionalLayer(Rectifier().apply, (3, 3), 32, (2, 2), name='l2'), ConvolutionalLayer(Rectifier().apply, (3, 3), 64, (2, 2), name='l3'), ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l4'), ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l5'), ConvolutionalLayer(Rectifier().apply, (3, 3), 128, (2, 2), name='l6')] convnet = ConvolutionalSequence(conv_layers, num_channels=4, image_size=(width, height), weights_init=init.Uniform(0, 0.01), biases_init=init.Constant(0.0), tied_biases=False, border_mode='full') convnet.initialize() output_dim = np.prod(convnet.get_dim('output')) conv_out = convnet.apply(self.x) reshape_dims = (conv_out.shape[0], conv_out.shape[1]*conv_out.shape[2]*conv_out.shape[3]) hidden_repr = conv_out.reshape(reshape_dims) conv2rnn = Linear(input_dim=output_dim, output_dim=RNN_DIMS, weights_init=init.Uniform(width=0.01), biases_init=init.Constant(0.)) conv2rnn.initialize() conv2rnn_output = conv2rnn.apply(hidden_repr) # RNN hidden layer # then we want to feed those conv representations into an RNN rnn = SimpleRecurrent(dim=RNN_DIMS, activation=Rectifier(), weights_init=init.Uniform(width=0.01)) rnn.initialize() self.learned_state = rnn.apply(inputs=conv2rnn_output, states=self.state, iterate=False) # linear output from hidden layer # the RNN has two outputs, but only this one has a target. That is, this is "expected return" # which the network attempts to minimize difference between expected return and actual return lin_output = Linear(input_dim=RNN_DIMS, output_dim=1, weights_init=init.Uniform(width=0.01), biases_init=init.Constant(0.)) lin_output.initialize() self.exp_reward = lin_output.apply(self.learned_state) self.get_exp_reward = theano.function([self.x, self.state], self.exp_reward) # softmax output from hidden layer # this provides a softmax of action recommendations # the hypothesis is that adjusting the other outputs magically influences this set of outputs # to suggest smarter (or more realistic?) 
# moves action_output = Linear(input_dim=RNN_DIMS, output_dim=NUM_ACTIONS, weights_init=init.Constant(.001), biases_init=init.Constant(0.)) action_output.initialize() self.suggested_actions = Softmax().apply(action_output.apply(self.learned_state[-1])) # use this to get suggested actions; it requires the state of the hidden units from the previous timestep self.get_suggested_actions = theano.function([self.x, self.state], [self.suggested_actions, self.learned_state])
def test_attention_recurrent(): rng = numpy.random.RandomState(1234) dim = 5 batch_size = 4 input_length = 20 attended_dim = 10 attended_length = 15 wrapped = SimpleRecurrent(dim, Identity()) attention = SequenceContentAttention(state_names=wrapped.apply.states, attended_dim=attended_dim, match_dim=attended_dim) recurrent = AttentionRecurrent(wrapped, attention, seed=1234) recurrent.weights_init = IsotropicGaussian(0.5) recurrent.biases_init = Constant(0) recurrent.initialize() attended = tensor.tensor3("attended") attended_mask = tensor.matrix("attended_mask") inputs = tensor.tensor3("inputs") inputs_mask = tensor.matrix("inputs_mask") outputs = recurrent.apply(inputs=inputs, mask=inputs_mask, attended=attended, attended_mask=attended_mask) states, glimpses, weights = outputs assert states.ndim == 3 assert glimpses.ndim == 3 assert weights.ndim == 3 # For values. def rand(size): return rng.uniform(size=size).astype(theano.config.floatX) # For masks. def generate_mask(length, batch_size): mask = numpy.ones((length, batch_size), dtype=theano.config.floatX) # To make it look like read data for i in range(batch_size): mask[1 + rng.randint(0, length - 1):, i] = 0.0 return mask input_vals = rand((input_length, batch_size, dim)) input_mask_vals = generate_mask(input_length, batch_size) attended_vals = rand((attended_length, batch_size, attended_dim)) attended_mask_vals = generate_mask(attended_length, batch_size) func = theano.function([inputs, inputs_mask, attended, attended_mask], [states, glimpses, weights]) states_vals, glimpses_vals, weight_vals = func(input_vals, input_mask_vals, attended_vals, attended_mask_vals) assert states_vals.shape == (input_length, batch_size, dim) assert glimpses_vals.shape == (input_length, batch_size, attended_dim) assert (len(ComputationGraph(outputs).shared_variables) == len( Selector(recurrent).get_parameters())) # Manual reimplementation inputs2d = tensor.matrix() states2d = tensor.matrix() mask1d = tensor.vector() weighted_averages = tensor.matrix() distribute_func = theano.function([inputs2d, weighted_averages], recurrent.distribute.apply( inputs=inputs2d, weighted_averages=weighted_averages)) wrapped_apply_func = theano.function([states2d, inputs2d, mask1d], wrapped.apply(states=states2d, inputs=inputs2d, mask=mask1d, iterate=False)) attention_func = theano.function([states2d, attended, attended_mask], attention.take_glimpses( attended=attended, attended_mask=attended_mask, states=states2d)) states_man = wrapped.initial_states(batch_size).eval() glimpses_man = numpy.zeros((batch_size, attended_dim), dtype=theano.config.floatX) for i in range(input_length): inputs_man = distribute_func(input_vals[i], glimpses_man) states_man = wrapped_apply_func(states_man, inputs_man, input_mask_vals[i]) glimpses_man, weights_man = attention_func(states_man, attended_vals, attended_mask_vals) assert_allclose(states_man, states_vals[i], rtol=1e-5) assert_allclose(glimpses_man, glimpses_vals[i], rtol=1e-5) assert_allclose(weights_man, weight_vals[i], rtol=1e-5) # weights for not masked position must be zero assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0) # weights for masked positions must be non-zero assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5) # weights from different steps should be noticeably different assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2 # weights for all state after the last masked position should be same for i in range(batch_size): last = int(input_mask_vals[:, i].sum()) for j in range(last, 
input_length): assert_allclose(weight_vals[last, i], weight_vals[j, i], 1e-5)
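The manual loop in this test follows the standard content-based attention step; stated loosely (the exact parametrization lives in SequenceContentAttention, so this is an orienting sketch rather than the library's literal formula):

e_{t,j} = v^\top \tanh(W_s s_t + W_a a_j), \qquad \alpha_{t,j} = \frac{m_j \exp(e_{t,j})}{\sum_k m_k \exp(e_{t,k})}, \qquad g_t = \sum_j \alpha_{t,j} a_j

where s_t is the wrapped RNN state, a_j the attended sequence, m_j the attended mask, and g_t the glimpse fed back through the distribute transform.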
brick.initialize() lstm.weights_init = IsotropicGaussian(0.01) #lstm.weights_init = Orthogonal() lstm.biases_init = Constant(0.) lstm.initialize() #ComputationGraph(encode.apply(x)).get_theano_function()(features_test)[0].shape #ComputationGraph(lstm.apply(encoded)).get_theano_function()(features_test) #ComputationGraph(decode.apply(hiddens[-1])).get_theano_function()(features_test)[0].shape #ComputationGraph(SquaredError().apply(y, y_hat.flatten())).get_theano_function()(features_test, targets_test)[0].shape encoded = encode.apply(x) #hiddens = lstm.apply(encoded, gates.apply(x)) hiddens = lstm.apply(encoded) y_hat = decode.apply(hiddens[-1]) cost = SquaredError().apply(y, y_hat) cost.name = 'cost' #ipdb.set_trace() #ComputationGraph(y_hat).get_theano_function()(features_test)[0].shape #ComputationGraph(cost).get_theano_function()(features_test, targets_test)[0].shape cg = ComputationGraph(cost) #cg = ComputationGraph(hiddens).get_theano_function() #ipdb.set_trace() algorithm = GradientDescent(cost=cost,
class Rnn(Initializable, BaseRecurrent): def __init__(self, dims=(88, 100, 100), **kwargs): super(Rnn, self).__init__(**kwargs) self.dims = dims self.input_transform = Linear( input_dim=dims[0], output_dim=dims[1], weights_init=IsotropicGaussian(0.01), # biases_init=Constant(0.0), use_bias=False, name="input_transfrom") self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="gru_rnn_layer") # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=False, name="h2h_transform") self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="lstm_rnn_layer") self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]], weights_init=IsotropicGaussian(0.01), use_bias=True, biases_init=Constant(0.0), name="out_layer") self.children = [ self.input_transform, self.gru_layer, self.linear_trans, self.lstm_layer, self.out_transform ] # @recurrent(sequences=['inputs', 'input_mask'], contexts=[], # states=['gru_state', 'lstm_state', 'lstm_cells'], # outputs=['gru_state', 'lstm_state', 'lstm_cells']) def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None): input_transform = self.input_transform.apply(inputs) gru_state = self.gru_layer.apply( inputs=input_transform, # update_inputs=input_transform, # reset_inputs=input_transform, states=gru_state, mask=mask, iterate=False) lstm_transform = self.linear_trans.apply(gru_state) lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, cells=lstm_cells, mask=mask, iterate=False) return gru_state, lstm_state, lstm_cells @recurrent(sequences=[], contexts=[], states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'], outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells']) def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None, lstm_cells=None): output = self.apply(inputs=inputs, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells, iterate=False) return output, gru_state, lstm_state, lstm_cells @recurrent(sequences=['inputs', 'mask'], contexts=[], states=['gru_state', 'lstm_state', 'lstm_cells'], outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells']) def apply(self, inputs, mask, gru_state=None, lstm_state=None, lstm_cells=None): # input_transform = self.input_transform.apply(inputs) # gru_state = self.gru_layer.apply( # inputs=input_transform, # mask=mask, # states=gru_state, # iterate=False) # lstm_transform = self.linear_trans.apply(gru_state) # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, # cells=lstm_cells, # mask=mask, iterate=False) gru_state, lstm_state, lstm_cells = self.rnn_apply( inputs=inputs, mask=mask, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells) output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None] return output, gru_state, lstm_state, lstm_cells def get_dim(self, name): dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims)) dims['lstm_cells'] = dims['lstm_state'] return dims.get(name, None) or super(Rnn, self).get_dim(name)
class SimpleRecurrentLayer(Initializable, Feedforward): """ The Blocks implementation of SimpleRecurrent is general and only handles the recurrent part. This class wraps the SimpleRecurrent class and adds a linear input transformation. It can be used for most basic cases as a layer in a sequence of layers. Parameters ---------- input_dim : int state_dim : int activation : Brick state_weights_init : NdarrayInitialization Initialization of the recurrent (state-to-state) weights. input_weights_init : NdarrayInitialization Initialization of weights in the linear transformation of the input. biases_init : NdarrayInitialization Initialization of biases in the linear transformation of the input. """ @lazy() def __init__( self, input_dim, state_dim, activation=Tanh(), state_weights_init=None, input_weights_init=None, biases_init=None, **kwargs ): super(SimpleRecurrentLayer, self).__init__(biases_init=biases_init, **kwargs) if state_weights_init is None: state_weights_init = init.IsotropicGaussian(0.01) if input_weights_init is None: input_weights_init = init.IsotropicGaussian(0.01) if biases_init is None: biases_init = init.Constant(0) self.input_transformation = Linear( input_dim=input_dim, output_dim=state_dim, weights_init=input_weights_init, biases_init=biases_init ) self.rnn = SimpleRecurrent(dim=state_dim, activation=activation, weights_init=state_weights_init) self.children = [self.input_transformation, self.rnn] @application def apply(self, inputs, *args, **kwargs): """ Transforms the input, sends it to SimpleRecurrent and returns the output. Parameters ---------- inputs : tensor.TensorVariable The 3-dimensional tensor of inputs in the shape (timesteps, batch_size, features). Returns ------- outputs : tensor.TensorVariable The 3-dimensional tensor of outputs in the shape (timesteps, batch_size, state_dim). """ rnn_inputs = self.input_transformation.apply(inputs) outputs = self.rnn.apply(inputs=rnn_inputs, *args, **kwargs) return outputs @apply.delegate def apply_delegate(self): return self.children[0].apply @property def input_dim(self): return self.input_transformation.input_dim @input_dim.setter def input_dim(self, value): self.input_transformation.input_dim = value @property def output_dim(self): return self.rnn.dim @output_dim.setter def output_dim(self, value): self.rnn.dim = value
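A small usage sketch for the wrapper above (hypothetical values; it assumes the same blocks imports the class itself relies on, and that its default initializers behave as the docstring describes):

import theano
from theano import tensor

layer = SimpleRecurrentLayer(input_dim=20, state_dim=50)
layer.initialize()
x = tensor.tensor3('x')   # (timesteps, batch, 20)
h = layer.apply(x)        # (timesteps, batch, 50)
f = theano.function([x], h)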
class EUTHM(UTHM): ''' UTH model with extend information ''' def __init__(self, config, dataset, *args, **kwargs): super(EUTHM, self).__init__(config, dataset) def _define_inputs(self, *args, **kwargs): super(EUTHM, self)._define_inputs() self.user_word = tensor.ivector('user_word') self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask', dtype=theano.config.floatX) self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx') self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx') self.hashtag_word = tensor.ivector('hashtag_word') self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask', dtype=theano.config.floatX) self.hashtag_word_left_idx = tensor.ivector( 'hashtag_word_idx_left_idx') self.hashtag_word_right_idx = tensor.ivector( 'hashtag_word_idx_right_idx') self.sparse_word = tensor.imatrix('sparse_word') self.sparse_word_sparse_mask = tensor.vector( 'sparse_word_sparse_mask', dtype=theano.config.floatX) self.sparse_word_mask = tensor.matrix('sparse_word_mask', dtype=theano.config.floatX) self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx') self.sparse_word_right_idx = tensor.ivector( 'sparse_word_idx_right_idx') def _build_bricks(self, *args, **kwargs): # Build lookup tables super(EUTHM, self)._build_bricks() self.user2word = MLP( activations=[Tanh('user2word_tanh')], dims=[self.config.user_embed_dim, self.config.word_embed_dim], name='user2word_mlp') self.user2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.user2word.biases_init = Constant(0) self.user2word.initialize() self.hashtag2word = MLP( activations=[Tanh('hashtag2word_tanh')], dims=[ self.config.user_embed_dim + self.config.word_embed_dim, self.config.word_embed_dim ], name='hashtag2word_mlp') self.hashtag2word.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.hashtag2word.biases_init = Constant(0) self.hashtag2word.initialize() self.user2word_bias = Bias(dim=1, name='user2word_bias') self.user2word_bias.biases_init = Constant(0) self.user2word_bias.initialize() self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias') self.hashtag2word_bias.biases_init = Constant(0) self.hashtag2word_bias.initialize() #Build character embedding self.char_embed = self._embed(len(self.dataset.char2index), self.config.char_embed_dim, name='char_embed') # Build sparse word encoder self.rnn_ins = Linear(input_dim=self.config.char_embed_dim, output_dim=self.config.word_embed_dim, name='rnn_in') self.rnn_ins.weights_init = IsotropicGaussian( std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim + self.config.word_embed_dim)) self.rnn_ins.biases_init = Constant(0) self.rnn_ins.initialize() self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim, activation=Tanh()) self.rnn.weights_init = IsotropicGaussian( std=1 / numpy.sqrt(self.config.word_embed_dim)) self.rnn.initialize() def _set_OV_value(self, *args, **kwargs): '''Train a <unk> representation''' tensor.set_subtensor( self.char_embed.W[self.dataset.char2index['<unk>']], numpy.zeros(self.config.char_embed_dim, dtype=theano.config.floatX)) def _get_text_vec(self, *args, **kwargs): # Transpose text self.text = self.text.dimshuffle(1, 0) self.text_mask = self.text_mask.dimshuffle(1, 0) self.sparse_word = self.sparse_word.dimshuffle(1, 0) self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0) # Turn word, user and hashtag into vector representation text_vec = self.word_embed.apply(self.text) # Apply user word, hashtag word and url text_vec = 
self._apply_user_word(text_vec) text_vec = self._apply_hashtag_word(text_vec) text_vec = self._apply_sparse_word(text_vec) return text_vec @abstractmethod def _apply_user_word(self, text_vec, *args, **kwargs): ''' Replace @a with transformed author vector :param text_vec: :param args: :param kwargs: :return: ''' user_word_vec = self.user2word.apply(self.user_embed.apply(self.user_word)) + \ self.user2word_bias.parameters[0][0] text_vec = tensor.set_subtensor( text_vec[self.user_word_right_idx, self.user_word_left_idx], text_vec[self.user_word_right_idx, self.user_word_left_idx] * (1 - self.user_word_sparse_mask[:, None]) + user_word_vec * self.user_word_sparse_mask[:, None]) return text_vec @abstractmethod def _apply_hashtag_word(self, text_vec, *args, **kwargs): ''' Replace #h with transformed hashtag vector :param text_vec: :param args: :param kwargs: :return: ''' hashtag_word_vec = self.hashtag2word.apply(self.hashtag_embed.apply(self.hashtag_word)) +\ self.hashtag2word_bias.parameters[0][0] text_vec = tensor.set_subtensor( text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx], text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx] * (1 - self.hashtag_sparse_mask[:, None]) + hashtag_word_vec * self.hashtag_sparse_mask[:, None]) return text_vec @abstractmethod def _apply_sparse_word(self, text_vec, *args, **kwargs): ''' Replace sparse word encoding with character embedding. (maybe lstm) :param text_vec: :param args: :param kwargs: :return: ''' sparse_word_vec = self.char_embed.apply(self.sparse_word) sparse_word_hiddens = self.rnn.apply( inputs=self.rnn_ins.apply(sparse_word_vec), mask=self.sparse_word_mask) tmp = sparse_word_hiddens[-1] text_vec = tensor.set_subtensor( text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx], text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx] * (1 - self.sparse_word_sparse_mask[:, None]) + tmp * self.sparse_word_sparse_mask[:, None]) return text_vec
n_epochs = 30 x_dim = 1 h_dim = 100 o_dim = 10 batch_size = 50 print 'Building model ...' # T x B x F x = tensor.tensor3('x', dtype=floatX) y = tensor.tensor3('y', dtype='int32') x_to_h1 = Linear(name='x_to_h1', input_dim=x_dim, output_dim=h_dim) pre_rnn = x_to_h1.apply(x) rnn = SimpleRecurrent(activation=Rectifier(), dim=h_dim, name="rnn") h1 = rnn.apply(pre_rnn) h1_to_o = Linear(name='h1_to_o', input_dim=h_dim, output_dim=o_dim) pre_softmax = h1_to_o.apply(h1) softmax = Softmax() shape = pre_softmax.shape softmax_out = softmax.apply(pre_softmax.reshape((-1, o_dim))) softmax_out = softmax_out.reshape(shape) softmax_out.name = 'softmax_out' # comparing only last time-step cost = CategoricalCrossEntropy().apply(y[-1, :, 0], softmax_out[-1]) cost.name = 'CrossEntropy' error_rate = MisclassificationRate().apply(y[-1, :, 0], softmax_out[-1]) error_rate.name = 'error_rate' # Initialization
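The excerpt stops at the initialization comment; a plausible continuation, mirroring the brick-initialization pattern used in the CTC example later in this collection (an assumption, not the original code):

from blocks.initialization import IsotropicGaussian, Constant

for brick in (x_to_h1, rnn, h1_to_o):
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0)
    brick.initialize()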
def main(num_epochs=100): x = tensor.matrix('features') m = tensor.matrix('features_mask') x_int = x.astype(dtype='int32').T train_dataset = TextFile('inspirational.txt') train_dataset.indexables[0] = numpy.array(sorted( train_dataset.indexables[0], key=len )) n_voc = len(train_dataset.dict.keys()) init_probs = numpy.array( [sum(filter(lambda idx:idx == w, [s[0] for s in train_dataset.indexables[ train_dataset.sources.index('features')]] )) for w in xrange(n_voc)], dtype=theano.config.floatX ) init_probs = init_probs / init_probs.sum() n_h = 100 linear_embedding = LookupTable( length=n_voc, dim=n_h, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) linear_embedding.initialize() lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX) lstm_biases[n_h:(2 * n_h)] = 4. rnn = SimpleRecurrent( dim=n_h, activation=Tanh(), weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) rnn.initialize() score_layer = Linear( input_dim=n_h, output_dim=n_voc, weights_init=Uniform(std=0.01), biases_init=Constant(0.) ) score_layer.initialize() embedding = (linear_embedding.apply(x_int[:-1]) * tensor.shape_padright(m.T[1:])) rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:]) probs = softmax( sequence_map(score_layer.apply, rnn_out, mask=m.T[1:])[0] ) idx_mask = m.T[1:].nonzero() cost = CategoricalCrossEntropy().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) cost.name = 'cost' misclassification = MisclassificationRate().apply( x_int[1:][idx_mask[0], idx_mask[1]], probs[idx_mask[0], idx_mask[1]] ) misclassification.name = 'misclassification' cg = ComputationGraph([cost]) params = cg.parameters algorithm = GradientDescent( cost=cost, params=params, step_rule=Adam() ) train_data_stream = Padding( data_stream=DataStream( dataset=train_dataset, iteration_scheme=BatchwiseShuffledScheme( examples=train_dataset.num_examples, batch_size=10, ) ), mask_sources=('features',) ) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append(TrainingDataMonitoring( [cost, misclassification], prefix='train', after_epoch=True)) batch_size = 10 length = 30 trng = MRG_RandomStreams(18032015) u = trng.uniform(size=(length, batch_size, n_voc)) gumbel_noise = -tensor.log(-tensor.log(u)) init_samples = (tensor.log(init_probs).dimshuffle(('x', 0)) + gumbel_noise[0]).argmax(axis=-1) init_states = rnn.initial_state('states', batch_size) def sampling_step(g_noise, states, samples_step): embedding_step = linear_embedding.apply(samples_step) next_states = rnn.apply(inputs=embedding_step, states=states, iterate=False) probs_step = softmax(score_layer.apply(next_states)) next_samples = (tensor.log(probs_step) + g_noise).argmax(axis=-1) return next_states, next_samples [_, samples], _ = theano.scan( fn=sampling_step, sequences=[gumbel_noise[1:]], outputs_info=[init_states, init_samples] ) sampling = theano.function([], samples.owner.inputs[0].T) plotters = [] plotters.append(Plotter( channels=[['train_cost', 'train_misclassification']], titles=['Costs'])) extensions.append(PlotManager('Language modelling example', plotters=plotters, after_epoch=True, after_training=True)) extensions.append(Printing()) extensions.append(PrintSamples(sampler=sampling, voc=train_dataset.inv_dict)) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
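# The sampling_step above draws categorical samples with the Gumbel-max trick:
# argmax(log p + g) with g = -log(-log(u)), u ~ Uniform(0, 1), is distributed according
# to p. A small numpy check of that identity (illustrative only):
import numpy as np

rng = np.random.RandomState(18032015)
p = np.array([0.2, 0.5, 0.3])
u = rng.uniform(size=(100000, 3))
gumbel_noise = -np.log(-np.log(u))
samples = (np.log(p) + gumbel_noise).argmax(axis=-1)
print(np.bincount(samples) / float(len(samples)))   # roughly [0.2, 0.5, 0.3]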
labels_mask = data[b'mask_labels'] print('Building model ...') # T x B x F x = tensor.tensor3('x', dtype=floatX) # T x B x_mask = tensor.matrix('x_mask', dtype=floatX) # L x B y = tensor.matrix('y', dtype=floatX) # L x B y_mask = tensor.matrix('y_mask', dtype=floatX) x_to_h = Linear(name='x_to_h', input_dim=x_dim, output_dim=h_dim) x_transform = x_to_h.apply(x) rnn = SimpleRecurrent(activation=Tanh(), dim=h_dim, name="rnn") h = rnn.apply(x_transform) h_to_o = Linear(name='h_to_o', input_dim=h_dim, output_dim=num_classes + 1) h_transform = h_to_o.apply(h) # T x B x C+1 y_hat = tensor.nnet.softmax(h_transform.reshape( (-1, num_classes + 1))).reshape((h.shape[0], h.shape[1], -1)) y_hat.name = 'y_hat' y_hat_mask = x_mask cost = CTC().apply(y, y_hat, y_mask, y_hat_mask, 'normal_scale') cost.name = 'CTC' # Initialization for brick in (rnn, x_to_h, h_to_o): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0) brick.initialize()
def construct_model(vocab_size, embedding_dim, hidden_dim, activation): # Construct the model x = tensor.lmatrix('features') x_mask = tensor.fmatrix('features_mask') y = tensor.lmatrix('targets') # Batch X Time y_mask = tensor.fmatrix('targets_mask') # Batch X Time frequency_mask = tensor.fmatrix('frequency_mask') frequency_mask_mask = tensor.fmatrix('frequency_mask_mask') # Only for the validation last_word = tensor.lvector('last_word') lookup = LookupTable(length=vocab_size, dim=embedding_dim, name='lookup') linear = Linear(input_dim=embedding_dim, output_dim=hidden_dim, name="linear") hidden = SimpleRecurrent(dim=hidden_dim, activation=activation, name='hidden_recurrent') top_linear = Linear(input_dim=hidden_dim, output_dim=vocab_size, name="top_linear") # Return 3D Tensor: Batch X Time X embedding_dim embeddings = lookup.apply(x) # Give time as the first index: Time X Batch X embedding_dim embeddings = embeddings.dimshuffle(1, 0, 2) pre_recurrent = linear.apply(embeddings) after_recurrent = hidden.apply(inputs=pre_recurrent, mask=x_mask.T)[:-1] after_recurrent_last = after_recurrent[-1] presoft = top_linear.apply(after_recurrent) # Define the cost # Give y as a vector and reshape presoft to 2D tensor y = y.flatten() shape = presoft.shape presoft = presoft.dimshuffle(1, 0, 2) presoft = presoft.reshape((shape[0] * shape[1], shape[2])) # Build cost_matrix presoft = presoft - presoft.max(axis=1).dimshuffle(0, 'x') log_prob = presoft - \ tensor.log(tensor.exp(presoft).sum(axis=1).dimshuffle(0, 'x')) flat_log_prob = log_prob.flatten() range_ = tensor.arange(y.shape[0]) flat_indices = y + range_ * presoft.shape[1] cost_matrix = flat_log_prob[flat_indices] # Mask useless values from the cost_matrix cost_matrix = - cost_matrix * \ y_mask.flatten() * frequency_mask.flatten() * \ frequency_mask_mask.flatten() # Average the cost cost = cost_matrix.sum() cost = cost / (y_mask * frequency_mask).sum() # Initialize parameters for brick in (lookup, linear, hidden, top_linear): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.) brick.initialize() return cost
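# Sketch of the cost computation in construct_model: after the log-softmax, the
# log-probability of each target is picked out of the flattened matrix with
# y + row * n_classes, which is equivalent to log_prob[row, y[row]]. Numpy check
# with illustrative values:
import numpy as np

log_prob = np.log(np.array([[0.7, 0.2, 0.1],
                            [0.1, 0.1, 0.8]]))
y = np.array([0, 2])
flat_log_prob = log_prob.flatten()
picked = flat_log_prob[y + np.arange(len(y)) * log_prob.shape[1]]
print(np.allclose(picked, log_prob[np.arange(len(y)), y]))   # True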
class CompositionalLayerToyBidirectional(Initializable): def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size, subword_RNN_hidden_state_size, add_one=True, **kwargs): super(CompositionalLayerToyBidirectional, self).__init__(**kwargs) self.batch_size = batch_size self.num_subwords = num_subwords # number of subwords which make up a word self.num_words = num_words # number of words in the sentence self.subword_embedding_size = subword_embedding_size self.input_vocab_size = input_vocab_size self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size self.add_one = add_one # adds 1 to the backward embeddings # create the lookup table self.lookup = LookupTable(length=self.input_vocab_size, dim=self.subword_embedding_size, name='input_lookup') self.lookup.weights_init = Uniform(width=0.08) self.lookup.biases_init = Constant(0) # one RNN per direction reads the subwords into a word embedding self.compositional_subword_to_word_RNN_forward = SimpleRecurrent( dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_forward', weights_init=Identity_init()) self.compositional_subword_to_word_RNN_backward = SimpleRecurrent( dim=self.subword_RNN_hidden_state_size, activation=Identity(), name='subword_RNN_backward', weights_init=Identity_init()) self.children = [self.lookup, self.compositional_subword_to_word_RNN_forward, self.compositional_subword_to_word_RNN_backward] ''' The RNNs consume the subword dimension, resulting in a 3d tensor of shape (num_words, batch_size, subword_RNN_hidden_state_size * 2), which is returned as 'word_embeddings'. The backward embedding elements have 1 added to them, to mark them as different from the forward ones. ''' @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['word_embeddings', 'word_embeddings_mask']) def apply(self, subword_id_input_, subword_id_input_mask_): # shape = (num_words, num_subwords, batch_size, embedding_size) subword_embeddings = self.lookup.apply(subword_id_input_) forward_result, updates = theano.scan( # loop over each word and let the forward RNN consume its subwords fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_forward.apply(subword_embeddings, mask=subword_id_input_mask_), sequences=[subword_embeddings, subword_id_input_mask_]) forward_word_embeddings = forward_result.dimshuffle(1, 0, 2, 3) # move the subword (time) axis to the front forward_word_embeddings = forward_word_embeddings[-1] # keep only the last state; the earlier ones are not needed backward_result, updates = theano.scan( # loop over each word and let the backward RNN consume its subwords in reverse fn=lambda subword_embeddings, subword_id_input_mask_: self.compositional_subword_to_word_RNN_backward.apply(subword_embeddings, mask=subword_id_input_mask_), sequences=[subword_embeddings[:, ::-1, :], subword_id_input_mask_[:, ::-1, :]]) backward_word_embeddings = backward_result.dimshuffle(1, 0, 2, 3) # move the subword (time) axis to the front backward_word_embeddings = backward_word_embeddings[-1] # keep only the last state; the earlier ones are not needed # NOTE: 1.0 is added to the backward embeddings to mark them as different from the forward ones backward_word_embeddings = backward_word_embeddings + 1.0 word_embeddings = T.concatenate([forward_word_embeddings, backward_word_embeddings], axis=2) # collapse the subword dim of the mask: a word is empty only if all of its subwords are masked out word_embeddings_mask = subword_id_input_mask_.max(axis=1) return word_embeddings, word_embeddings_mask
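# The backward RNN above is fed the subword sequence reversed along axis 1
# ([:, ::-1, :]) so that its final state summarises the word read right to left.
# A small numpy illustration of that indexing (shapes are illustrative only):
import numpy as np

subwords = np.arange(2 * 4 * 3).reshape(2, 4, 3)   # words x subwords x batch
print(subwords[0, :, 0])                           # [0 3 6 9]
print(subwords[:, ::-1, :][0, :, 0])               # [9 6 3 0]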
iteration = 300 # number of epochs of gradient descent print "Building Model" # Symbolic variables x = tensor.tensor3('x', dtype=floatX) target = tensor.tensor3('target', dtype=floatX) # Build the model linear = Linear(input_dim=n_u, output_dim=n_h, name="first_layer") rnn = SimpleRecurrent(dim=n_h, activation=Tanh()) linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer") sigm = Sigmoid() x_transform = linear.apply(x) h = rnn.apply(x_transform) predict = sigm.apply(linear2.apply(h)) # only for generation B x h_dim h_initial = tensor.tensor3('h_initial', dtype=floatX) h_testing = rnn.apply(x_transform, h_initial, iterate=False) y_hat_testing = linear2.apply(h_testing) y_hat_testing = sigm.apply(y_hat_testing) y_hat_testing.name = 'y_hat_testing' # Cost function cost = SquaredError().apply(predict, target) # Initialization
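# Sketch of single-step application with iterate=False, the mechanism used above for
# h_testing: apply() consumes one time step (batch x dim) and returns the next hidden
# state instead of scanning over a whole sequence. A minimal, self-contained example
# with illustrative names:
import numpy as np
import theano
from theano import tensor
from blocks import initialization
from blocks.bricks import Tanh
from blocks.bricks.recurrent import SimpleRecurrent

step_rnn = SimpleRecurrent(dim=3, activation=Tanh(),
                           weights_init=initialization.Identity())
step_rnn.initialize()

x_t = tensor.matrix('x_t')       # one time step: batch x dim
h_tm1 = tensor.matrix('h_tm1')   # previous state: batch x dim
h_t = step_rnn.apply(inputs=x_t, states=h_tm1, iterate=False)
step = theano.function([x_t, h_tm1], h_t)

h = np.zeros((1, 3), dtype=theano.config.floatX)
for _ in range(3):
    h = step(np.ones((1, 3), dtype=theano.config.floatX), h)
    print(h)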
class Rnn(Initializable, BaseRecurrent): def __init__(self, dims=(88, 100, 100), **kwargs): super(Rnn, self).__init__(**kwargs) self.dims = dims self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1], weights_init=IsotropicGaussian(0.01), # biases_init=Constant(0.0), use_bias=False, name="input_transfrom") self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="gru_rnn_layer") # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=False, name="h2h_transform") self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(), weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0), use_bias=True, name="lstm_rnn_layer") self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]], weights_init=IsotropicGaussian(0.01), use_bias=True, biases_init=Constant(0.0), name="out_layer") self.children = [self.input_transform, self.gru_layer, self.linear_trans, self.lstm_layer, self.out_transform] # @recurrent(sequences=['inputs', 'input_mask'], contexts=[], # states=['gru_state', 'lstm_state', 'lstm_cells'], # outputs=['gru_state', 'lstm_state', 'lstm_cells']) def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None): input_transform = self.input_transform.apply(inputs) gru_state = self.gru_layer.apply( inputs=input_transform, # update_inputs=input_transform, # reset_inputs=input_transform, states=gru_state, mask=mask, iterate=False) lstm_transform = self.linear_trans.apply(gru_state) lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, cells=lstm_cells, mask=mask, iterate=False) return gru_state, lstm_state, lstm_cells @recurrent(sequences=[], contexts=[], states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'], outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells']) def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None, lstm_cells=None): output = self.apply(inputs=inputs, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells, iterate=False) return output, gru_state, lstm_state, lstm_cells @recurrent(sequences=['inputs', 'mask'], contexts=[], states=['gru_state', 'lstm_state', 'lstm_cells'], outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells']) def apply(self, inputs, mask, gru_state=None, lstm_state=None, lstm_cells=None): # input_transform = self.input_transform.apply(inputs) # gru_state = self.gru_layer.apply( # inputs=input_transform, # mask=mask, # states=gru_state, # iterate=False) # lstm_transform = self.linear_trans.apply(gru_state) # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state, # cells=lstm_cells, # mask=mask, iterate=False) gru_state, lstm_state, lstm_cells = self.rnn_apply(inputs=inputs, mask=mask, gru_state=gru_state, lstm_state=lstm_state, lstm_cells=lstm_cells) output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None] return output, gru_state, lstm_state, lstm_cells def get_dim(self, name): dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims)) dims['lstm_cells'] = dims['lstm_state'] return dims.get(name, None) or super(Rnn, self).get_dim(name)
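# The factor of 4 in linear_trans above comes from Blocks' LSTM brick: at every step it
# expects its input to already contain the stacked pre-activations for the gates and the
# cell candidate, i.e. a vector of size 4 * dim. A minimal sketch (illustrative names,
# independent of the Rnn class above):
import numpy as np
import theano
from theano import tensor
from blocks.bricks import Linear, Tanh
from blocks.bricks.recurrent import LSTM
from blocks.initialization import Constant, IsotropicGaussian

dim = 5
to_gates = Linear(input_dim=dim, output_dim=4 * dim,
                  weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0))
lstm = LSTM(dim=dim, activation=Tanh(),
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.0))
to_gates.initialize()
lstm.initialize()

x = tensor.tensor3('x')                            # time x batch x dim
states, cells = lstm.apply(inputs=to_gates.apply(x))
f = theano.function([x], [states, cells])
s, c = f(np.ones((7, 2, dim), dtype=theano.config.floatX))
print(s.shape, c.shape)                            # (7, 2, 5) (7, 2, 5)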
def main(save_to, num_epochs): batch_size = 128 dim = 100 n_steps = 20 i2h1 = MLP([Identity()], [784, dim], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001)) h2o1 = MLP([Rectifier(), Logistic()], [dim, dim, 784], biases_init=Constant(0.), weights_init=IsotropicGaussian(.001)) rec1 = SimpleRecurrent(dim=dim, activation=Tanh(), weights_init=Orthogonal()) i2h1.initialize() h2o1.initialize() rec1.initialize() x = tensor.tensor3('features') x1 = x[1:, :, :] x2 = x[:-1, :, :] preproc = i2h1.apply(x1) h1 = rec1.apply(preproc) x_hat = h2o1.apply(h1) cost = tensor.nnet.binary_crossentropy(x_hat, x2).mean() # cost = CategoricalCrossEntropy().apply(y.flatten(), probs) cost.name = 'final_cost' cg = ComputationGraph([cost, ]) mnist_train = MNIST("train", subset=slice(0, 50000), sources=('features', )) mnist_valid = MNIST("train", subset=slice(50000, 60000), sources=('features',)) mnist_test = MNIST("test") trainstream = Mapping(Flatten(DataStream(mnist_train, iteration_scheme=SequentialScheme(50000, batch_size))), _meanize(n_steps)) validstream = Mapping(Flatten(DataStream(mnist_valid, iteration_scheme=SequentialScheme(10000, batch_size))), _meanize(n_steps)) teststream = Mapping(Flatten(DataStream(mnist_test, iteration_scheme=SequentialScheme(10000, batch_size))), _meanize(n_steps)) algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=CompositeRule([Adam(), StepClipping(100)])) main_loop = MainLoop( algorithm, trainstream, extensions=[Timing(), FinishAfter(after_n_epochs=num_epochs), # DataStreamMonitoring( # [cost, ], # teststream, # prefix="test"), DataStreamMonitoringAndSaving( [cost, ], validstream, [i2h1, h2o1, rec1], 'best_'+save_to+'.pkl', cost_name=cost.name, after_epoch=True, prefix='valid'), TrainingDataMonitoring( [cost, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), # Plot( # save_to, # channels=[ # ['test_final_cost', # 'test_misclassificationrate_apply_error_rate'], # ['train_total_gradient_norm']]), Printing()]) main_loop.run()
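# In the model above the network reads x[1:] while the cost scores its output against
# x[:-1], i.e. input and target are the same sequence offset by one step. A tiny numpy
# view of that pairing (illustrative only):
import numpy as np

x = np.arange(5)[:, None, None]   # T x B x F with T=5, B=F=1
x1, x2 = x[1:], x[:-1]
print(x1[:, 0, 0])                # [1 2 3 4]  fed to the recurrent layer
print(x2[:, 0, 0])                # [0 1 2 3]  compared against the reconstruction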