def example2():
    """Run a small GRU demo and print the resulting states twice.

    A ``Fork`` and a ``GatedRecurrent`` brick (identity weights, zero
    biases) are applied to a 3-D input; the states are printed for an
    all-ones input.  The same bricks are then applied to the input after a
    ``Linear`` brick initialised with ``Identity(2)`` and printed again.
    """
    dim = 3
    x = tensor.tensor3('x')

    def ones_input():
        # Fresh all-ones batch of shape (dim, 1, dim) in Theano's float type.
        return np.ones((dim, 1, dim), dtype=theano.config.floatX)

    fork = Fork(
        input_dim=dim,
        output_dims=[dim, dim * 2],
        name='fork',
        output_names=["linear", "gates"],
        weights_init=initialization.Identity(),
        biases_init=Constant(0),
    )
    gru = GatedRecurrent(
        dim=dim,
        weights_init=initialization.Identity(),
        biases_init=Constant(0),
    )
    fork.initialize()
    gru.initialize()

    # Plain GRU on the raw input.
    linear, gate_inputs = fork.apply(x)
    states = gru.apply(linear, gate_inputs)
    compute_states = theano.function([x], states)
    print(compute_states(ones_input()))

    # Same GRU, but fed with the input passed through the doubler first.
    doubler = Linear(
        input_dim=dim,
        output_dim=dim,
        weights_init=initialization.Identity(2),
        biases_init=initialization.Constant(0),
    )
    doubler.initialize()
    doubled_linear, doubled_gates = fork.apply(doubler.apply(x))
    doubled_states = gru.apply(doubled_linear, doubled_gates)
    compute_states = theano.function([x], doubled_states)
    print(compute_states(ones_input()))
def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
    """Create the audio, phoneme and word sub-encoders and their forks.

    For each of the three inputs a ``BidirectionalWMT15`` brick wrapping a
    tanh ``GatedRecurrent`` of size ``state_dim`` is built, plus a forward
    and a backward ``Fork`` (``Linear`` prototype) whose output names are
    the non-mask sequences of the recurrent prototype.

    Args:
        feature_size: Stored as ``self.feature_size``.
        embedding_dim: Stored as ``self.embedding_dim``.
        state_dim: Dimension of every ``GatedRecurrent`` created here.
        **kwargs: Forwarded to the parent constructor.
    """
    super(BidirectionalPhonemeAudioEncoder, self).__init__(**kwargs)
    self.feature_size = feature_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    def make_bidir(brick_name):
        # Bidirectional wrapper around a fresh tanh GRU.
        return BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name=brick_name)

    def make_fork(bidir, brick_name):
        # One fork output per recurrent input sequence, mask excluded.
        sequence_names = [seq for seq in bidir.prototype.apply.sequences
                          if seq != 'mask']
        return Fork(sequence_names, prototype=Linear(), name=brick_name)

    self.audio_embedding = make_bidir("audio_embeddings")
    self.audio_fwd_fork = make_fork(self.audio_embedding, 'audio_fwd_fork')
    self.audio_back_fork = make_fork(self.audio_embedding, 'audio_back_fork')

    self.phoneme_embedding = make_bidir("phoneme_embeddings")
    self.phoneme_fwd_fork = make_fork(self.phoneme_embedding,
                                      'phoneme_fwd_fork')
    self.phoneme_back_fork = make_fork(self.phoneme_embedding,
                                       'phoneme_back_fork')

    self.words_embedding = make_bidir("words_embeddings")
    self.words_fwd_fork = make_fork(self.words_embedding, 'words_fwd_fork')
    self.words_back_fork = make_fork(self.words_embedding, 'words_back_fork')

    self.children = [
        self.phoneme_embedding,
        self.audio_embedding,
        self.words_embedding,
        self.phoneme_fwd_fork,
        self.phoneme_back_fork,
        self.audio_fwd_fork,
        self.audio_back_fork,
        self.words_fwd_fork,
        self.words_back_fork,
    ]
def gru_layer(dim, h, n):
    """Build, initialise and apply one Fork + GatedRecurrent layer.

    Args:
        dim: Input dimension of the fork and dimension of the GRU.
        h: Variable fed into the fork.
        n: Layer index; its string form is appended to all brick and
            output names.

    Returns:
        The result of applying the GRU to the two fork outputs.
    """
    suffix = str(n)
    fork = Fork(
        output_names=['linear' + suffix, 'gates' + suffix],
        name='fork' + suffix,
        input_dim=dim,
        output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + suffix)
    initialize([fork, gru])
    state_inputs, gate_inputs = fork.apply(h)
    return gru.apply(state_inputs, gate_inputs)
def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
    """Create the lookup table, two bidirectional GRUs and their forks.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        embedding_dim: Stored as ``self.embedding_dim``.
        state_dim: Dimension of both ``GatedRecurrent`` bricks.
        **kwargs: Forwarded to the parent constructor.
    """
    super(BidirectionalPhonesEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    def make_bidir(brick_name):
        # Bidirectional wrapper around a fresh tanh GRU.
        return BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name=brick_name)

    def make_fork(bidir, brick_name):
        # One fork output per recurrent input sequence, mask excluded.
        sequence_names = [seq for seq in bidir.prototype.apply.sequences
                          if seq != 'mask']
        return Fork(sequence_names, prototype=Linear(), name=brick_name)

    self.lookup = LookupTable(name='phones_embeddings')

    self.embedding = make_bidir("audio_embeddings")
    self.embedding_fwd_fork = make_fork(self.embedding, 'embedding_fwd_fork')
    self.embedding_back_fork = make_fork(self.embedding,
                                         'embedding_back_fork')

    self.bidir = make_bidir("audio_representation")
    self.fwd_fork = make_fork(self.bidir, 'fwd_fork')
    self.back_fork = make_fork(self.bidir, 'back_fork')

    self.children = [
        self.lookup,
        self.bidir,
        self.embedding,
        self.fwd_fork,
        self.back_fork,
        self.embedding_fwd_fork,
        self.embedding_back_fork,
    ]
def __init__(self, k=20, rec_h_dim=400, att_size=10, num_letters=68,
             sampling_bias=0., attention_type="graves", epsilon=1e-6,
             attention_alignment=1., **kwargs):
    """Store the hyper-parameters and create the child bricks.

    Args:
        k: Passed to ``BivariateGMMEmitter``; the readout size is
            ``1 + 6 * k``.
        rec_h_dim: Dimension of the ``GatedRecurrent`` cell.
        att_size: Size of each of the three attention fork outputs.
        num_letters: Input dimension of the attention-to-cell fork and of
            the attention-to-readout linear brick.
        sampling_bias: Passed to ``BivariateGMMEmitter``.
        attention_type: Either ``"graves"`` or ``"softmax"``.
        epsilon: Stored as ``self.epsilon``.
        attention_alignment: Stored as ``self.attention_alignment``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(Scribe, self).__init__(**kwargs)

    # For now only softmax and graves are supported.
    assert attention_type in ["graves", "softmax"]

    readouts_dim = 1 + 6 * k

    self.k = k
    self.rec_h_dim = rec_h_dim
    self.att_size = att_size
    self.num_letters = num_letters
    self.sampling_bias = sampling_bias
    self.attention_type = attention_type
    self.epsilon = epsilon
    self.attention_alignment = attention_alignment

    # Recurrent cell and the fork feeding it from the 3-dimensional input.
    self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')
    self.inp_to_h1 = Fork(
        output_names=['cell1_inputs', 'cell1_gates'],
        input_dim=3,
        output_dims=[rec_h_dim, 2 * rec_h_dim],
        name='inp_to_h1')

    # Projections out of the cell state.
    self.h1_to_readout = Linear(
        input_dim=rec_h_dim,
        output_dim=readouts_dim,
        name='h1_to_readout')
    self.h1_to_att = Fork(
        output_names=['alpha', 'beta', 'kappa'],
        input_dim=rec_h_dim,
        output_dims=[att_size] * 3,
        name='h1_to_att')

    # Projections out of the attention result.
    self.att_to_h1 = Fork(
        output_names=['cell1_inputs', 'cell1_gates'],
        input_dim=num_letters,
        output_dims=[rec_h_dim, 2 * rec_h_dim],
        name='att_to_h1')
    self.att_to_readout = Linear(
        input_dim=num_letters,
        output_dim=readouts_dim,
        name='att_to_readout')

    self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

    self.children = [
        self.cell1,
        self.inp_to_h1,
        self.h1_to_readout,
        self.h1_to_att,
        self.att_to_h1,
        self.att_to_readout,
        self.emitter,
    ]
def setUp(self):
    """Create and initialise the two GatedRecurrent fixtures."""
    # Fixture with every weight set to the constant 2.
    constant_gru = GatedRecurrent(
        dim=3,
        activation=Tanh(),
        gate_activation=Tanh(),
        weights_init=Constant(2),
    )
    constant_gru.initialize()
    self.gated = constant_gru

    # Fixture with Gaussian weights and seed 1.
    gaussian_gru = GatedRecurrent(
        dim=3,
        activation=Tanh(),
        gate_activation=Tanh(),
        weights_init=IsotropicGaussian(),
        seed=1,
    )
    self.reset_only = gaussian_gru
    self.reset_only.initialize()
def setUp(self):
    """Create and initialise the two GatedRecurrent fixtures."""
    # Fixture with every weight set to the constant 2.
    constant_gru = GatedRecurrent(
        dim=3,
        weights_init=Constant(2),
        activation=Tanh(),
        gate_activation=Tanh(),
    )
    constant_gru.initialize()
    self.gated = constant_gru

    # Fixture without an update gate, Gaussian weights, RandomState(1).
    reset_only_gru = GatedRecurrent(
        dim=3,
        weights_init=IsotropicGaussian(),
        activation=Tanh(),
        gate_activation=Tanh(),
        use_update_gate=False,
        rng=numpy.random.RandomState(1),
    )
    self.reset_only = reset_only_gru
    self.reset_only.initialize()
def gru_layer(dim, h, n):
    """Stack one GRU layer (fork followed by GatedRecurrent) on top of ``h``.

    Args:
        dim: Input dimension of the fork and dimension of the GRU.
        h: Variable fed into the fork.
        n: Layer index, appended (as a string) to brick and output names.

    Returns:
        The output of ``GatedRecurrent.apply`` on the two fork outputs.
    """
    tag = str(n)
    output_names = ["linear" + tag, "gates" + tag]
    fork = Fork(output_names=output_names, name="fork" + tag,
                input_dim=dim, output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name="gru" + tag)
    initialize([fork, gru])
    forked = fork.apply(h)
    linear, gates = forked
    return gru.apply(linear, gates)
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    """Build, initialise and apply one GRU layer, masking only if ``first``.

    Args:
        dim: Input dimension of the fork and dimension of the GRU.
        h: Variable fed into the fork.
        n: Layer index, appended (as a string) to brick and output names.
        x_mask: Mask passed to the GRU when ``first`` is truthy.
        first: Whether ``x_mask`` is handed to the GRU.
        **kwargs: Extra keyword arguments forwarded to ``gru.apply``.

    Returns:
        The result of ``gru.apply``.
    """
    suffix = str(n)
    fork = Fork(
        output_names=['linear' + suffix, 'gates' + suffix],
        name='fork' + suffix,
        input_dim=dim,
        output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + suffix)
    initialize([fork, gru])

    linear, gates = fork.apply(h)
    if not first:
        return gru.apply(linear, gates, **kwargs)
    return gru.apply(linear, gates, mask=x_mask, **kwargs)
def __init__(self, encoder_type, num_characters, input_dim, encoder_dim,
             **kwargs):
    """Create the optional label embedding and bidirectional encoder.

    Args:
        encoder_type: ``None`` or ``'bidirectional'``.
        num_characters: Number of rows of the lookup table; must equal
            ``input_dim`` when no lookup table is created.
        input_dim: Lookup table dimension and input dimension of the
            recurrent transition.
        encoder_dim: Dimension of the ``GatedRecurrent`` brick.
        **kwargs: Forwarded to the parent constructor.
    """
    # NOTE(review): 'lookup' is handled further down but rejected by this
    # assertion, so that branch is only reachable for 'bidirectional'.
    # Confirm whether 'lookup' was meant to be accepted.
    assert encoder_type in (None, 'bidirectional')
    self.encoder_type = encoder_type
    super(Encoder, self).__init__(**kwargs)

    self.children = []

    needs_lookup = encoder_type in ('lookup', 'bidirectional')
    if not needs_lookup:
        # If there is no encoder.
        assert num_characters == input_dim
    else:
        self.embed_label = LookupTable(
            num_characters, input_dim, name='embed_label')
        self.children += [self.embed_label]

    if encoder_type == 'bidirectional':
        transition = RecurrentWithFork(
            GatedRecurrent(dim=encoder_dim).apply,
            input_dim,
            name='encoder_transition')
        self.encoder = Bidirectional(transition, name='encoder')
        self.children.append(self.encoder)
def __init__(self, embedding_dim, state_dim, **kwargs):
    """Create the bidirectional GRU and its forward/backward forks.

    Args:
        embedding_dim: Dimension of the word embeddings taken as input.
        state_dim: Hidden state dimension of the GRU.
        **kwargs: Forwarded to the parent constructor.
    """
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    # The bidirectional GRU.
    self.bidir = BidirectionalFromDict(
        GatedRecurrent(activation=Tanh(), dim=state_dim))

    def make_fork(brick_name):
        # Forks administer the GRU inputs: one output per non-mask sequence.
        sequence_names = [
            seq for seq in self.bidir.prototype.apply.sequences
            if seq != 'mask'
        ]
        return Fork(sequence_names, prototype=Linear(), name=brick_name)

    self.fwd_fork = make_fork('fwd_fork')
    self.back_fork = make_fork('back_fork')
    self.children = [self.bidir, self.fwd_fork, self.back_fork]
def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
    """Create the inner GRU and the two forks feeding it.

    The child bricks are created before the parent constructor is called
    and registered as children afterwards.

    Args:
        inner_input_dim: Input dimension of ``inner_input_fork``.
        outer_input_dim: Input dimension of ``outer_input_fork``.
        inner_dim: Dimension of the inner ``GatedRecurrent``.
        **kwargs: Forwarded to the parent constructor.
    """
    self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')

    def gru_sequence_names():
        # Fresh list of the GRU input sequences, anything mask-like removed.
        return [seq for seq in self.inner_gru.apply.sequences
                if 'mask' not in seq]

    self.inner_input_fork = Fork(
        output_names=gru_sequence_names(),
        input_dim=inner_input_dim,
        name='inner_input_fork')
    self.outer_input_fork = Fork(
        output_names=gru_sequence_names(),
        input_dim=outer_input_dim,
        name='inner_outer_fork')

    super(InnerRecurrent, self).__init__(**kwargs)

    self.children = [
        self.inner_gru,
        self.inner_input_fork,
        self.outer_input_fork,
    ]
def __init__(self, hidden_size_recurrent, k, **kwargs):
    """Assemble a sequence generator from a 3-layer GRU stack.

    Args:
        hidden_size_recurrent: Dimension of each ``GatedRecurrent`` layer.
        k: Passed to ``BivariateGMMEmitter``; the readout size is
            ``6 * k + 1``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(Scribe, self).__init__(**kwargs)

    readout_size = 6 * k + 1

    gru_layers = [
        GatedRecurrent(dim=hidden_size_recurrent,
                       name="gru_{}".format(layer_index))
        for layer_index in range(3)
    ]
    stack = RecurrentStack(
        gru_layers, name="transition", skip_connections=True)

    emitter = BivariateGMMEmitter(k=k)

    # The readout reads every stack state whose name contains 'states'.
    state_sources = [state for state in stack.apply.states
                     if 'states' in state]
    readout = Readout(
        readout_dim=readout_size,
        source_names=state_sources,
        emitter=emitter,
        name="readout")

    self.generator = SequenceGenerator(
        readout=readout, transition=stack, name="generator")
    self.children = [self.generator]
def __init__(self, embedding_dim, state_dim, **kwargs):
    """Constructor.

    Note that this implementation only supports single layer
    architectures.

    Args:
        embedding_dim (int): Dimensionality of the word vectors defined
            by the sparse feature map.
        state_dim (int): Size of the recurrent layer.
        **kwargs: Forwarded to the parent constructor.
    """
    super(NoLookupEncoder, self).__init__(**kwargs)
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    recurrent_prototype = GatedRecurrent(activation=Tanh(), dim=state_dim)
    self.bidir = BidirectionalWMT15(recurrent_prototype)

    def make_fork(brick_name):
        # One fork output per recurrent input sequence, mask excluded.
        sequence_names = [
            seq for seq in self.bidir.prototype.apply.sequences
            if seq != 'mask'
        ]
        return Fork(sequence_names, prototype=Linear(), name=brick_name)

    self.fwd_fork = make_fork('fwd_fork')
    self.back_fork = make_fork('back_fork')
    self.children = [self.bidir, self.fwd_fork, self.back_fork]
def test_sequence_generator():
    """Check output shapes of ``cost`` and ``generate`` for a generator."""
    # Disclaimer: here we only check shapes, not values.
    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(
        name="transition",
        activation=Tanh(),
        dim=dim,
        weights_init=Orthogonal())
    readout = LinearReadout(
        readout_dim=output_dim,
        source_names=["states"],
        emitter=TestEmitter(name="emitter"),
        name="readout")
    generator = SequenceGenerator(
        readout,
        transition,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")
    generator.initialize()

    # Cost: one value per (step, batch element).
    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2

    compute_costs = theano.function([y, mask], [costs])
    y_val = numpy.zeros((n_steps, batch_size, output_dim), dtype=floatX)
    mask_val = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = compute_costs(y_val, mask_val)[0]
    assert costs_val.shape == (n_steps, batch_size)

    # Generation: evaluate the three returned variables.
    generated = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    states, outputs, costs = [variable.eval() for variable in generated]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
    """Create the embedding lookup table and the GRU.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        embedding_dim: Stored as ``self.embedding_dim``.
        state_dim: Dimension of the ``GatedRecurrent`` brick.
        **kwargs: Forwarded to the parent constructor.
    """
    super(Encoder, self).__init__(**kwargs)

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    lookup = LookupTable(name='embeddings')
    gru = GatedRecurrent(activation=Tanh(), dim=state_dim)
    self.lookup = lookup
    self.GRU = gru
    self.children = [lookup, gru]
class InnerRecurrent(BaseRecurrent, Initializable):
    """Recurrent brick driving a GRU with the sum of two forked inputs.

    At every step the per-step ``inner_inputs`` and the ``outer_inputs``
    context are each passed through their own ``Fork``; the matching fork
    outputs are added and fed, together with the previous states, into a
    single step of the inner ``GatedRecurrent``.
    """

    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        """Create the inner GRU and the two forks feeding it.

        Args:
            inner_input_dim: Input dimension of ``inner_input_fork``.
            outer_input_dim: Input dimension of ``outer_input_fork``.
            inner_dim: Dimension of the inner ``GatedRecurrent``.
            **kwargs: Forwarded to the parent constructor.
        """
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')

        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')

        super(InnerRecurrent, self).__init__(**kwargs)

        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]

    def _push_allocation_config(self):
        """Set both forks' output dims from the inner GRU's dimensions."""
        self.inner_input_fork.output_dims = self.inner_gru.get_dims(
            self.inner_input_fork.output_names)
        self.outer_input_fork.output_dims = self.inner_gru.get_dims(
            self.outer_input_fork.output_names)

    @recurrent(sequences=['inner_inputs'], states=['states'],
               contexts=['outer_inputs'], outputs=['states'])
    def apply(self, inner_inputs, states, outer_inputs):
        """Compute the next states from the inputs, context and states.

        Args:
            inner_inputs: Sequence input, forked by ``inner_input_fork``.
            states: Previous states handed to the inner GRU.
            outer_inputs: Context, forked by ``outer_input_fork``.

        Returns:
            The new states produced by one non-iterated GRU application.
        """
        forked_inputs = self.inner_input_fork.apply(inner_inputs, as_dict=True)
        forked_states = self.outer_input_fork.apply(outer_inputs, as_dict=True)

        # Both forks expose the same output names, so sum them key by key.
        gru_inputs = {key: forked_inputs[key] + forked_states[key]
                      for key in forked_inputs.keys()}

        new_states = self.inner_gru.apply(
            iterate=False,
            **dict_union(gru_inputs, {'states': states}))
        return new_states

    def get_dim(self, name):
        """Return the dimension of ``name``.

        Args:
            name: Variable name; only ``'states'`` is supported.

        Returns:
            The inner GRU's dimension for ``'states'``.

        Raises:
            AttributeError: If ``name`` is anything other than ``'states'``.
        """
        if name == 'states':
            return self.inner_gru.get_dim(name)
        # The original returned the AttributeError class itself, which would
        # silently be used as a dimension by callers; raise it instead.
        raise AttributeError(
            "InnerRecurrent has no dimension information for "
            "'{}'".format(name))
def __init__(self, hidden_dim, activation=None, gate_activation=None,
             state_to_state_init=None, state_to_update_init=None,
             state_to_reset_init=None, input_to_state_transform=None,
             input_to_update_transform=None,
             input_to_reset_transform=None, **kwargs):
    """Store the initialisers and transforms and create the recurrent core.

    Args:
        hidden_dim: Dimension of the recurrent brick.
        activation: Passed to the recurrent brick.
        gate_activation: Passed to the recurrent brick.
        state_to_state_init: Stored on the instance.
        state_to_update_init: Stored on the instance.
        state_to_reset_init: Stored on the instance.
        input_to_state_transform: Brick registered as a child; its name
            gets the suffix ``"_input_to_state_transform"``.
        input_to_update_transform: Brick registered as a child; its name
            gets the suffix ``"_input_to_update_transform"``.
        input_to_reset_transform: Brick registered as a child; its name
            gets the suffix ``"_input_to_reset_transform"``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(GatedRecurrentFull, self).__init__(**kwargs)
    self.hidden_dim = hidden_dim

    self.state_to_state_init = state_to_state_init
    self.state_to_update_init = state_to_update_init
    self.state_to_reset_init = state_to_reset_init

    self.input_to_state_transform = input_to_state_transform
    self.input_to_update_transform = input_to_update_transform
    self.input_to_reset_transform = input_to_reset_transform

    # NOTE(review): the three transforms default to None, but their
    # ``name`` attribute is updated unconditionally below.
    self.input_to_state_transform.name += "_input_to_state_transform"
    self.input_to_update_transform.name += "_input_to_update_transform"
    self.input_to_reset_transform.name += "_input_to_reset_transform"

    # Hard-wired switch between the two recurrent implementations.
    self.use_mine = True
    recurrent_class = GatedRecurrentFast if self.use_mine else GatedRecurrent
    # Weights start as NaN placeholders.
    self.rnn = recurrent_class(
        weights_init=Constant(np.nan),
        dim=self.hidden_dim,
        activation=activation,
        gate_activation=gate_activation)

    self.children = [
        self.rnn,
        self.input_to_state_transform,
        self.input_to_update_transform,
        self.input_to_reset_transform,
    ]
    self.children.extend(self.rnn.children)
def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
             **kwargs):
    """Create the lookup table, the recurrent transition and its fork.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        embedding_dim: Stored as ``self.embedding_dim``.
        state_dim: Stored as ``self.state_dim``.
        reverse: Stored as ``self.reverse``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(Encoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.reverse = reverse

    self.lookup = LookupTable(name='embeddings')
    self.transition = GatedRecurrent(Tanh(), name='encoder_transition')

    # One fork output per transition input sequence, mask excluded.
    fork_outputs = [seq for seq in self.transition.apply.sequences
                    if seq != 'mask']
    self.fork = Fork(fork_outputs, prototype=Linear())

    self.children = [self.lookup, self.transition, self.fork]
class GatedRecurrentWithContext(Initializable):
    """Wrapper around ``GatedRecurrent`` that adds context to its inputs.

    Unknown attributes are looked up on the wrapped brick through
    ``__getattr__``.
    """

    def __init__(self, *args, **kwargs):
        """Create the wrapped ``GatedRecurrent`` from all given arguments.

        NOTE(review): the parent constructor is not called here; confirm
        this is intentional.
        """
        self.gated_recurrent = GatedRecurrent(*args, **kwargs)
        self.children = [self.gated_recurrent]

    @application(states=['states'],
                 outputs=['states'],
                 contexts=[
                     'readout_context', 'transition_context',
                     'update_context', 'reset_context'
                 ])
    def apply(self, transition_context, update_context, reset_context, *args,
              **kwargs):
        """Add the contexts to the matching inputs and apply the GRU.

        ``kwargs`` must contain ``'inputs'``, ``'update_inputs'``,
        ``'reset_inputs'`` and ``'readout_context'``; a missing key raises
        ``KeyError``.

        Returns:
            Whatever the wrapped brick's ``apply`` returns.
        """
        # NOTE(review): the gate inputs are indexed unconditionally here,
        # while the ``sequences`` property below lists them only when the
        # corresponding gate is enabled; confirm both gates are always on.
        kwargs['inputs'] += transition_context
        kwargs['update_inputs'] += update_context
        kwargs['reset_inputs'] += reset_context
        # readout_context was only added for the Readout brick, discard it
        kwargs.pop('readout_context')
        return self.gated_recurrent.apply(*args, **kwargs)

    def get_dim(self, name):
        """Return ``self.dim`` for context names, else ask the wrapped brick."""
        if name in [
                'readout_context', 'transition_context', 'update_context',
                'reset_context'
        ]:
            # ``dim`` is not set in this class; presumably it resolves to the
            # wrapped brick's ``dim`` through ``__getattr__``.
            return self.dim
        return self.gated_recurrent.get_dim(name)

    def __getattr__(self, name):
        """Delegate attributes not found on this object to the wrapped brick."""
        # Python only calls __getattr__ when normal lookup fails.  Raising for
        # 'gated_recurrent' itself avoids infinite recursion when that
        # attribute has not been set yet.
        if name == 'gated_recurrent':
            raise AttributeError
        return getattr(self.gated_recurrent, name)

    @apply.property('sequences')
    def apply_inputs(self):
        """List the sequence names of ``apply`` for the enabled gates."""
        sequences = ['mask', 'inputs']
        if self.use_update_gate:
            sequences.append('update_inputs')
        if self.use_reset_gate:
            sequences.append('reset_inputs')
        return sequences
def example2():
    """Print GRU states for a plain and for a doubled all-ones input.

    A ``Fork`` and a ``GatedRecurrent`` brick are created with identity
    weights and zero biases.  Their output is printed for an all-ones
    input, then again with a ``Linear`` brick initialised with
    ``Identity(2)`` inserted in front of the fork.
    """
    dim = 3
    x = tensor.tensor3('x')

    fork = Fork(
        input_dim=dim,
        output_dims=[dim, dim * 2],
        name='fork',
        output_names=["linear", "gates"],
        weights_init=initialization.Identity(),
        biases_init=Constant(0))
    gru = GatedRecurrent(
        dim=dim,
        weights_init=initialization.Identity(),
        biases_init=Constant(0))
    fork.initialize()
    gru.initialize()

    # First run: GRU directly on x.
    plain_linear, plain_gates = fork.apply(x)
    h = gru.apply(plain_linear, plain_gates)
    run_plain = theano.function([x], h)
    plain_input = np.ones((dim, 1, dim), dtype=theano.config.floatX)
    print(run_plain(plain_input))

    # Second run: x goes through the doubler before the fork.
    doubler = Linear(
        input_dim=dim,
        output_dim=dim,
        weights_init=initialization.Identity(2),
        biases_init=initialization.Constant(0))
    doubler.initialize()
    doubled = doubler.apply(x)
    lin, gate = fork.apply(doubled)
    h_doubler = gru.apply(lin, gate)
    run_doubled = theano.function([x], h_doubler)
    doubled_input = np.ones((dim, 1, dim), dtype=theano.config.floatX)
    print(run_doubled(doubled_input))
def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
             **kwargs):
    """Set up the embedding lookup, the GRU transition and the input fork.

    Args:
        vocab_size: Stored as ``self.vocab_size``.
        embedding_dim: Stored as ``self.embedding_dim``.
        state_dim: Stored as ``self.state_dim``.
        reverse: Stored as ``self.reverse``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(Encoder, self).__init__(**kwargs)

    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.reverse = reverse

    lookup = LookupTable(name='embeddings')
    transition = GatedRecurrent(Tanh(), name='encoder_transition')
    self.lookup = lookup
    self.transition = transition

    # The fork produces every transition input sequence except the mask.
    non_mask_sequences = [
        seq for seq in self.transition.apply.sequences if seq != 'mask'
    ]
    self.fork = Fork(non_mask_sequences, prototype=Linear())

    self.children = [self.lookup, self.transition, self.fork]
def __init__(self, blockid, vocab_size, embedding_dim, state_dim, **kwargs):
    """Create the lookup table, GRU and forward fork, named per block id.

    Args:
        blockid: String appended to the names of all child bricks.
        vocab_size: Stored as ``self.vocab_size``.
        embedding_dim: Stored as ``self.embedding_dim``.
        state_dim: Dimension of the ``GatedRecurrent`` brick.
        **kwargs: Forwarded to the parent constructor.
    """
    super(Encoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.blockid = blockid

    self.lookup = LookupTable(name='embeddings' + '_' + self.blockid)
    self.gru = GatedRecurrent(
        activation=Tanh(),
        dim=state_dim,
        name="GatedRNN" + self.blockid)

    # One fork output per GRU input sequence, mask excluded.
    gru_sequences = [seq for seq in self.gru.apply.sequences
                     if seq != 'mask']
    self.fwd_fork = Fork(
        gru_sequences,
        prototype=Linear(),
        name='fwd_fork' + '_' + self.blockid)

    self.children = [self.lookup, self.gru, self.fwd_fork]
class Encoder(Initializable):
    """Encoder of RNNsearch model."""

    def __init__(self, blockid, vocab_size, embedding_dim, state_dim,
                 **kwargs):
        """Create the lookup table, GRU and forward fork, named per block id.

        Args:
            blockid: String appended to the names of all child bricks.
            vocab_size: Number of rows of the lookup table.
            embedding_dim: Lookup dimension and fork input dimension.
            state_dim: Dimension of the ``GatedRecurrent`` brick.
            **kwargs: Forwarded to the parent constructor.
        """
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.blockid = blockid

        self.lookup = LookupTable(name='embeddings' + '_' + self.blockid)
        self.gru = GatedRecurrent(
            activation=Tanh(),
            dim=state_dim,
            name="GatedRNN" + self.blockid)

        gru_sequences = [seq for seq in self.gru.apply.sequences
                         if seq != 'mask']
        self.fwd_fork = Fork(
            gru_sequences,
            prototype=Linear(),
            name='fwd_fork' + '_' + self.blockid)

        self.children = [self.lookup, self.gru, self.fwd_fork]

    def _push_allocation_config(self):
        """Propagate the configured sizes to the lookup table and the fork."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.fwd_fork.input_dim = self.embedding_dim
        # Each fork output takes the dimension the GRU reports for it.
        fork_dims = [self.gru.get_dim(output_name)
                     for output_name in self.fwd_fork.output_names]
        self.fwd_fork.output_dims = fork_dims

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Embed the sentence and run the masked GRU over it.

        Both inputs are transposed before use.

        Returns:
            The output of the GRU.
        """
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        embeddings = self.lookup.apply(source_sentence)
        forked = self.fwd_fork.apply(embeddings, as_dict=True)
        gru_arguments = merge(forked, {'mask': source_sentence_mask})
        return self.gru.apply(**gru_arguments)
def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
             state_dim, **kwargs):
    """Sole constructor.

    Creates one lookup table and, for every layer, a bidirectional tanh
    GRU plus a forward and a backward fork.

    NOTE(review): an earlier version of this docstring stated that the
    layers share the same weight matrices, but this constructor builds a
    separate brick per layer. Confirm whether sharing happens elsewhere.

    Args:
        vocab_size (int): Source vocabulary size
        embedding_dim (int): Dimension of the embedding layer
        n_layers (int): Number of layers.
        skip_connections (bool): Skip connections connect the source
            word embeddings directly with deeper layers to propagate
            the gradient more efficiently
        state_dim (int): Number of hidden units in the recurrent layers.
        **kwargs: Forwarded to the parent constructor.
    """
    super(DeepBidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.n_layers = n_layers
    self.state_dim = state_dim
    self.skip_connections = skip_connections

    self.lookup = LookupTable(name='embeddings')
    self.bidirs = []
    self.fwd_forks = []
    self.back_forks = []

    # ``range`` instead of ``xrange`` so this also runs on Python 3.
    for i in range(self.n_layers):
        bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name='bidir%d' % i)
        self.bidirs.append(bidir)
        # One fork output per recurrent input sequence, mask excluded.
        self.fwd_forks.append(
            Fork([name for name in bidir.prototype.apply.sequences
                  if name != 'mask'],
                 prototype=Linear(),
                 name='fwd_fork%d' % i))
        self.back_forks.append(
            Fork([name for name in bidir.prototype.apply.sequences
                  if name != 'mask'],
                 prototype=Linear(),
                 name='back_fork%d' % i))

    self.children = ([self.lookup]
                     + self.bidirs
                     + self.fwd_forks
                     + self.back_forks)
class GatedRecurrentWithContext(Initializable):
    """Wrapper around ``GatedRecurrent`` that adds context to its inputs.

    Unknown attributes are looked up on the wrapped brick through
    ``__getattr__``.
    """

    def __init__(self, *args, **kwargs):
        """Create the wrapped ``GatedRecurrent`` from all given arguments.

        NOTE(review): the parent constructor is not called here; confirm
        this is intentional.
        """
        self.gated_recurrent = GatedRecurrent(*args, **kwargs)
        self.children = [self.gated_recurrent]

    @application(states=['states'], outputs=['states'],
                 contexts=['readout_context', 'transition_context',
                           'update_context', 'reset_context'])
    def apply(self, transition_context, update_context, reset_context,
              *args, **kwargs):
        """Add the contexts to the matching inputs and apply the GRU.

        ``kwargs`` must contain ``'inputs'``, ``'update_inputs'``,
        ``'reset_inputs'`` and ``'readout_context'``; a missing key raises
        ``KeyError``.

        Returns:
            Whatever the wrapped brick's ``apply`` returns.
        """
        # NOTE(review): the gate inputs are indexed unconditionally here,
        # while the ``sequences`` property below lists them only when the
        # corresponding gate is enabled; confirm both gates are always on.
        kwargs['inputs'] += transition_context
        kwargs['update_inputs'] += update_context
        kwargs['reset_inputs'] += reset_context
        # readout_context was only added for the Readout brick, discard it
        kwargs.pop('readout_context')
        return self.gated_recurrent.apply(*args, **kwargs)

    def get_dim(self, name):
        """Return ``self.dim`` for context names, else ask the wrapped brick."""
        if name in ['readout_context', 'transition_context',
                    'update_context', 'reset_context']:
            # ``dim`` is not set in this class; presumably it resolves to the
            # wrapped brick's ``dim`` through ``__getattr__``.
            return self.dim
        return self.gated_recurrent.get_dim(name)

    def __getattr__(self, name):
        """Delegate attributes not found on this object to the wrapped brick."""
        # Python only calls __getattr__ when normal lookup fails.  Raising for
        # 'gated_recurrent' itself avoids infinite recursion when that
        # attribute has not been set yet.
        if name == 'gated_recurrent':
            raise AttributeError
        return getattr(self.gated_recurrent, name)

    @apply.property('sequences')
    def apply_inputs(self):
        """List the sequence names of ``apply`` for the enabled gates."""
        sequences = ['mask', 'inputs']
        if self.use_update_gate:
            sequences.append('update_inputs')
        if self.use_reset_gate:
            sequences.append('reset_inputs')
        return sequences
def __init__(self, dimension, input_size, embed_input=False, **kwargs):
    """Create the embedder, the input fork and the bidirectional GRU.

    Args:
        dimension: Output size of the embedder, input size of the fork and
            dimension of the ``GatedRecurrent`` brick.
        input_size: First argument of the embedder.
        embed_input: If truthy the embedder is a ``LookupTable``,
            otherwise a ``Linear`` brick.
        **kwargs: Forwarded to the parent constructor.
    """
    super(GRUEncoder, self).__init__(**kwargs)

    embedder_class = LookupTable if embed_input else Linear
    self.embedder = embedder_class(input_size, dimension)

    # The gate inputs are twice as wide as the state inputs.
    self.fork = Fork(
        ['inputs', 'gate_inputs'],
        dimension,
        output_dims=[dimension, 2 * dimension],
        prototype=Linear())

    self.encoder = Bidirectional(
        GatedRecurrent(dim=dimension, activation=Tanh()))

    self.children = [self.encoder, self.embedder, self.fork]
def __init__(self, hidden_dim, activation=None, gate_activation=None,
             state_to_state_init=None, state_to_update_init=None,
             state_to_reset_init=None, input_to_state_transform=None,
             input_to_update_transform=None,
             input_to_reset_transform=None, **kwargs):
    """Keep the given initialisers/transforms and build the recurrent core.

    Args:
        hidden_dim: Dimension of the recurrent brick.
        activation: Passed to the recurrent brick.
        gate_activation: Passed to the recurrent brick.
        state_to_state_init: Stored on the instance.
        state_to_update_init: Stored on the instance.
        state_to_reset_init: Stored on the instance.
        input_to_state_transform: Child brick; ``name`` gets the suffix
            ``"_input_to_state_transform"``.
        input_to_update_transform: Child brick; ``name`` gets the suffix
            ``"_input_to_update_transform"``.
        input_to_reset_transform: Child brick; ``name`` gets the suffix
            ``"_input_to_reset_transform"``.
        **kwargs: Forwarded to the parent constructor.
    """
    super(GatedRecurrentFull, self).__init__(**kwargs)

    self.hidden_dim = hidden_dim
    self.state_to_state_init = state_to_state_init
    self.state_to_update_init = state_to_update_init
    self.state_to_reset_init = state_to_reset_init
    self.input_to_state_transform = input_to_state_transform
    self.input_to_update_transform = input_to_update_transform
    self.input_to_reset_transform = input_to_reset_transform

    # NOTE(review): these lines require all three transforms to be given,
    # even though the signature defaults them to None.
    self.input_to_state_transform.name += "_input_to_state_transform"
    self.input_to_update_transform.name += "_input_to_update_transform"
    self.input_to_reset_transform.name += "_input_to_reset_transform"

    # Hard-wired choice of recurrent implementation.
    self.use_mine = True
    if not self.use_mine:
        self.rnn = GatedRecurrent(
            weights_init=Constant(np.nan),
            dim=self.hidden_dim,
            activation=activation,
            gate_activation=gate_activation)
    else:
        self.rnn = GatedRecurrentFast(
            weights_init=Constant(np.nan),
            dim=self.hidden_dim,
            activation=activation,
            gate_activation=gate_activation)

    self.children = [
        self.rnn,
        self.input_to_state_transform,
        self.input_to_update_transform,
        self.input_to_reset_transform,
    ]
    self.children.extend(self.rnn.children)
class Encoder(Initializable):
    """Sentence encoder returning the last state of a GRU over embeddings."""

    def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
                 **kwargs):
        """Create the lookup table, the recurrent transition and its fork.

        Args:
            vocab_size: Number of rows of the lookup table.
            embedding_dim: Lookup dimension and fork input dimension.
            state_dim: Dimension of the transition and of each fork output.
            reverse: Whether ``apply`` reverses the time axis.
            **kwargs: Forwarded to the parent constructor.
        """
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')
        fork_outputs = [seq for seq in self.transition.apply.sequences
                        if seq != 'mask']
        self.fork = Fork(fork_outputs, prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        """Propagate the configured sizes to the child bricks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        # Every fork output gets the state dimension.
        self.fork.output_dims = [self.state_dim] * len(self.fork.output_names)

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Encode the sentence and return the final recurrent state.

        Both inputs are transposed and, if ``self.reverse`` is set,
        reversed along the first axis before the transition is applied.

        Returns:
            The last element of the transition output.
        """
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]

        embeddings = self.lookup.apply(source_sentence)
        forked = self.fork.apply(embeddings, as_dict=True)
        transition_arguments = merge(forked, {'mask': source_sentence_mask})
        representation = self.transition.apply(**transition_arguments)
        return representation[-1]
def test_integer_sequence_generator():
    """Check shapes and dtype for a generator emitting integer symbols."""
    # Disclaimer: here we only check shapes, not values.
    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(
        name="transition",
        activation=Tanh(),
        dim=dim,
        weights_init=Orthogonal())
    readout = LinearReadout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedbacker=LookupFeedback(readout_dim, feedback_dim),
        name="readout")
    generator = SequenceGenerator(
        readout,
        transition,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")
    generator.initialize()

    # Cost: one value per (step, batch element).
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2

    compute_costs = theano.function([y, mask], [costs])
    y_val = numpy.zeros((n_steps, batch_size), dtype='int64')
    mask_val = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = compute_costs(y_val, mask_val)[0]
    assert costs_val.shape == (n_steps, batch_size)

    # Generation: compile with the updates stored on the costs' ancestor.
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    generation_updates = costs.owner.inputs[0].owner.tag.updates
    run_generation = theano.function(
        [], [states, outputs, costs], updates=generation_updates)
    states_val, outputs_val, costs_val = run_generation()

    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
def __init__(self, src_vocab_size, embedding_dim, dgru_state_dim, state_dim,
             src_dgru_depth, bidir_encoder_depth, **kwargs):
    """Create the decimator and a stack of bidirectional GRU layers.

    The first bidirectional layer is built explicitly; every further layer
    is a deep copy of it whose children get ``input_dim = 2 * state_dim``
    and which is renamed ``'bidir<n>'``.

    Args:
        src_vocab_size: First argument of ``Decimator``.
        embedding_dim: Second argument of ``Decimator``.
        dgru_state_dim: Third argument of ``Decimator`` and input
            dimension of the first layer's ``RecurrentWithFork``.
        state_dim: Dimension of the ``GatedRecurrent`` brick.
        src_dgru_depth: Fourth argument of ``Decimator``.
        bidir_encoder_depth: Total number of bidirectional layers.
        **kwargs: Forwarded to the parent constructor.
    """
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.state_dim = state_dim
    self.dgru_state_dim = dgru_state_dim

    self.decimator = Decimator(
        src_vocab_size, embedding_dim, dgru_state_dim, src_dgru_depth)

    first_transition = RecurrentWithFork(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        dgru_state_dim,
        name='with_fork')
    self.bidir = Bidirectional(first_transition, name='bidir0')

    self.children = [self.decimator, self.bidir]

    for layer_n in range(1, bidir_encoder_depth):
        self.children.append(copy.deepcopy(self.bidir))
        stacked_layer = self.children[-1]
        for sub_brick in stacked_layer.children:
            sub_brick.input_dim = 2 * state_dim
        stacked_layer.name = 'bidir{}'.format(layer_n)
class Encoder(Initializable):
    """Sentence encoder returning the last state of a GRU over embeddings."""

    def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
                 **kwargs):
        """Set up the embedding lookup, the GRU transition and its fork.

        Args:
            vocab_size: Number of rows of the lookup table.
            embedding_dim: Lookup dimension and fork input dimension.
            state_dim: Dimension of the transition and of each fork output.
            reverse: Whether ``apply`` reverses the time axis.
            **kwargs: Forwarded to the parent constructor.
        """
        super(Encoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')

        non_mask_sequences = [
            seq for seq in self.transition.apply.sequences if seq != 'mask'
        ]
        self.fork = Fork(non_mask_sequences, prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        """Push the configured sizes down to the child bricks."""
        lookup, transition, fork = self.lookup, self.transition, self.fork
        lookup.length = self.vocab_size
        lookup.dim = self.embedding_dim
        transition.dim = self.state_dim
        fork.input_dim = self.embedding_dim
        # Every fork output gets the state dimension.
        fork.output_dims = [self.state_dim for _ in fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Encode the sentence and return the final recurrent state.

        Both inputs are transposed and, if ``self.reverse`` is set,
        reversed along the first axis before the transition is applied.

        Returns:
            The last element of the transition output.
        """
        # Time as first dimension
        source_sentence = source_sentence.dimshuffle(1, 0)
        source_sentence_mask = source_sentence_mask.T
        if self.reverse:
            source_sentence = source_sentence[::-1]
            source_sentence_mask = source_sentence_mask[::-1]

        embeddings = self.lookup.apply(source_sentence)
        transition_inputs = merge(
            self.fork.apply(embeddings, as_dict=True),
            {'mask': source_sentence_mask})
        all_states = self.transition.apply(**transition_inputs)
        return all_states[-1]
def __init__(self, base_encoder, state_dim=1000, self_attendable=False,
             **kwargs):
    """Constructor.

    Note that ``self.state_dim`` is set to twice the ``state_dim``
    argument, and that doubled value is used for the recurrent core and
    for the attention dimensions.

    Args:
        base_encoder (Brick): Low level encoder network which produces
            annotations to attend to
        state_dim (int): Size of the recurrent layer.
        self_attendable (bool): If true, the annotator can attend to its
            own previous states. If false it can only attend to base
            annotations
        **kwargs: Forwarded to the parent constructor.
    """
    super(HierarchicalAnnotator, self).__init__(**kwargs)
    self.state_dim = state_dim * 2
    self.base_encoder = base_encoder
    self.self_attendable = self_attendable

    trans_core = GatedRecurrent(activation=Tanh(), dim=self.state_dim)

    # Arguments common to both attention variants.
    attention_arguments = dict(
        state_names=trans_core.apply.states,
        attended_dim=self.state_dim,
        match_dim=self.state_dim,
        name="hier_attention")
    if not self_attendable:
        self.attention = SequenceContentAttention(**attention_arguments)
    else:
        self.attention = SelfAttendableContentAttention(
            num_steps=10, **attention_arguments)

    self.transition = AttentionRecurrent(
        trans_core, self.attention, name="hier_att_trans")
    self.children = [self.transition]
class Encoder(Initializable):
    """Encoder applying a GRU directly to looked-up embeddings."""

    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        """Create the embedding lookup table and the GRU.

        Args:
            vocab_size: Number of rows of the lookup table.
            embedding_dim: Dimension of the lookup table.
            state_dim: Dimension of the ``GatedRecurrent`` brick.
            **kwargs: Forwarded to the parent constructor.
        """
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        lookup = LookupTable(name='embeddings')
        gru = GatedRecurrent(activation=Tanh(), dim=state_dim)
        self.lookup = lookup
        self.GRU = gru
        self.children = [lookup, gru]

    def _push_allocation_config(self):
        """Configure the lookup table size from the stored settings."""
        lookup = self.lookup
        lookup.length = self.vocab_size
        lookup.dim = self.embedding_dim

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Embed the transposed sentence and run the GRU over it.

        The embeddings are passed as both positional arguments of the GRU.

        Returns:
            The output of the GRU.
        """
        source_sentence = source_sentence.T
        # NOTE(review): the transposed mask is computed but never handed to
        # the GRU. Confirm whether masking was intended.
        source_sentence_mask = source_sentence_mask.T

        embeddings = self.lookup.apply(source_sentence)
        return self.GRU.apply(embeddings, embeddings)
class TestGatedRecurrent(unittest.TestCase):
    """Compare `GatedRecurrent` against a NumPy reference recurrence."""

    def setUp(self):
        """Create and initialize two 3-dimensional bricks.

        ``gated`` uses constant weights of 2; ``reset_only`` uses
        Gaussian weights drawn with a fixed seed. Both use tanh for the
        activation and for the gates.
        """
        self.gated = GatedRecurrent(
            dim=3, activation=Tanh(),
            gate_activation=Tanh(), weights_init=Constant(2))
        self.gated.initialize()
        self.reset_only = GatedRecurrent(
            dim=3, activation=Tanh(),
            gate_activation=Tanh(),
            weights_init=IsotropicGaussian(), seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        """Check a single non-iterated step on a batch of two examples."""
        h0 = tensor.matrix('h0')
        x = tensor.matrix('x')
        gi = tensor.matrix('gi')
        h1 = self.gated.apply(x, gi, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, gi], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]],
                                  dtype=theano.config.floatX)
        # Inputs of the z gate and of the r gate, respectively
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val
        # Mirrors the Constant(2) initialization used in setUp
        W_val = 2 * numpy.ones((3, 3), dtype=theano.config.floatX)

        # Reference computation: z interpolates between the candidate
        # state and h0, r scales h0 before the state-to-state product.
        z_val = numpy.tanh(h0_val.dot(W_val) + zi_val)
        r_val = numpy.tanh(h0_val.dot(W_val) + ri_val)
        h1_val = (z_val * numpy.tanh((r_val * h0_val).dot(W_val) + x_val) +
                  (1 - z_val) * h0_val)
        # The gate inputs are fed as one matrix: z inputs first, then r
        assert_allclose(
            h1_val, next_h(h0_val, x_val, numpy.hstack([zi_val, ri_val]))[0],
            rtol=1e-6)

    def test_many_steps(self):
        """Check 24 masked steps on a batch of four sequences."""
        x = tensor.tensor3('x')
        gi = tensor.tensor3('gi')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, gi, mask=mask)
        calc_h = theano.function(inputs=[x, gi, mask], outputs=[h])

        # The 24 permutations of range(4) give a (24, 4) array which is
        # broadcast over a trailing feature axis of size 3.
        x_val = 0.1 * numpy.asarray(list(itertools.permutations(range(4))),
                                    dtype=theano.config.floatX)
        x_val = numpy.ones((24, 4, 3),
                           dtype=theano.config.floatX) * x_val[..., None]
        ri_val = 0.3 - x_val
        zi_val = 2 * ri_val
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        # Mask out the last batch element for steps 12 to 23
        mask_val[12:24, 3] = 0
        # h_val[0] stays zero and serves as the reference initial state
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        W = self.reset_only.state_to_state.get_value()
        # First three columns drive the z gate, the remaining ones the r gate
        Wz = self.reset_only.state_to_gates.get_value()[:, :3]
        Wr = self.reset_only.state_to_gates.get_value()[:, 3:]

        for i in range(1, 25):
            z_val = numpy.tanh(h_val[i - 1].dot(Wz) + zi_val[i - 1])
            r_val = numpy.tanh(h_val[i - 1].dot(Wr) + ri_val[i - 1])
            # Candidate state, then gate interpolation, then masking:
            # masked positions carry the previous state forward.
            h_val[i] = numpy.tanh((r_val * h_val[i - 1]).dot(W) +
                                  x_val[i - 1])
            h_val[i] = z_val * h_val[i] + (1 - z_val) * h_val[i - 1]
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
        # Drop the initial state so the reference lines up with the output
        h_val = h_val[1:]
        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(
            h_val,
            calc_h(x_val, numpy.concatenate(
                [zi_val, ri_val], axis=2), mask_val)[0],
            1e-04)

        # Also test that initial state is a parameter
        initial_state, = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial_state)
        assert initial_state.name == 'initial_state'
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    The test builds a GRU-based generator with a softmax emitter and a
    lookup feedback, then checks `cost_matrix`, `cost`, the auxiliary
    per-element cost and `generate` against fixed reference values that
    were obtained with the seeds used below.

    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim, feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    # Random targets in [0, readout_dim) with an all-ones mask
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    # The auxiliary variable is looked up by its composed name
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test generate
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph(states + outputs + costs)
    # The updates of the graph are passed on to the compiled function
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # Test masks agnostic results of cost
    # The second column of the longer batch repeats the short sequence
    # [1, 2] followed by one masked step, so its total cost must match.
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]], [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def __init__(self, vocab_size, embedding_dim, dgru_state_dim,
             igru_state_dim, state_dim, representation_dim,
             transition_depth, trg_igru_depth, trg_dgru_depth,
             trg_space_idx, trg_bos, theano_seed=None, **kwargs):
    """Assemble transition, attention, readout and sequence generator.

    The constructor arguments are stored on the instance (except
    ``trg_bos`` and the depth arguments, which are only forwarded to the
    sub-bricks). The sequence generator is the only registered child.
    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.dgru_state_dim = dgru_state_dim
    self.igru_state_dim = igru_state_dim
    self.state_dim = state_dim
    self.trg_space_idx = trg_space_idx
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    layers = [GRUInitialState(attended_dim=state_dim,
                              dim=state_dim,
                              activation=Tanh(),
                              name='decoder_gru_withinit')]
    # Layers 1 .. transition_depth - 1 are plain gated recurrent bricks
    for depth in range(1, transition_depth):
        layers.append(GatedRecurrent(dim=state_dim,
                                     activation=Tanh(),
                                     name='decoder_gru' + str(depth)))
    transition = RecurrentStack(layers, skip_connections=False)
    self.transition = transition

    # Initialize the attention mechanism
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")
    self.attention = attention

    # The readout reads the states, the feedback and the first output
    # of the attention's take_glimpses application.
    readout_sources = ['states', 'feedback',
                       attention.take_glimpses.outputs[0]]
    emitter = SoftmaxEmitter(initial_output=trg_bos,
                             theano_seed=theano_seed)
    feedback_brick = TargetWordEncoder(vocab_size, embedding_dim,
                                       self.dgru_state_dim, trg_dgru_depth)
    self.interpolator = Interpolator(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        igru_state_dim=igru_state_dim,
        igru_depth=trg_igru_depth,
        trg_dgru_depth=trg_dgru_depth,
        source_names=readout_sources,
        readout_dim=self.vocab_size,
        emitter=emitter,
        feedback_brick=feedback_brick)

    # Build sequence generator accordingly
    forked_names = [seq for seq in transition.apply.sequences
                    if seq != 'mask']
    self.sequence_generator = SequenceGeneratorDCNMT(
        trg_space_idx=self.trg_space_idx,
        readout=self.interpolator,
        transition=transition,
        attention=attention,
        transition_depth=transition_depth,
        igru_depth=trg_igru_depth,
        trg_dgru_depth=trg_dgru_depth,
        fork=Fork(forked_names, prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, n_layers,
             skip_connections, state_dim, **kwargs):
    """Sole constructor.

    Args:
        vocab_size (int): Source vocabulary size
        embedding_dim (int): Dimension of the embedding layer
        n_layers (int): Number of layers. Layers share the same weight
            matrices. With 0 layers only the lookup table is created
            and ``embedding_dim`` is overridden with ``2 * state_dim``.
        skip_connections (bool): Skip connections connect the source
            word embeddings directly with deeper layers to propagate
            the gradient more efficiently
        state_dim (int): Number of hidden units in the recurrent layers.

    Raises:
        ValueError: If ``n_layers`` is negative. Previously this case
            was only logged, which left the brick without its lookup
            children and failed later in a less obvious place.
    """
    super(BidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.n_layers = n_layers
    self.state_dim = state_dim
    self.skip_connections = skip_connections
    self.lookup = LookupTable(name='embeddings')

    def make_fork(fork_name):
        # One output per sequence input of the recurrent prototype,
        # except the mask; each fork gets its own Linear prototype.
        return Fork([name
                     for name in self.bidir.prototype.apply.sequences
                     if name != 'mask'],
                    prototype=Linear(),
                    name=fork_name)

    if self.n_layers >= 1:
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))
        self.fwd_fork = make_fork('fwd_fork')
        self.back_fork = make_fork('back_fork')
        self.children = [self.lookup, self.bidir,
                         self.fwd_fork, self.back_fork]
        if self.n_layers > 1:  # Deep encoder
            self.mid_fwd_fork = make_fork('mid_fwd_fork')
            self.mid_back_fork = make_fork('mid_back_fork')
            self.children.append(self.mid_fwd_fork)
            self.children.append(self.mid_back_fork)
    elif self.n_layers == 0:
        self.embedding_dim = state_dim * 2
        self.children = [self.lookup]
    else:
        # logging.fatal only emits a record; raise so that construction
        # actually stops on an invalid configuration.
        message = "Number of encoder layers must be non-negative"
        logging.fatal(message)
        raise ValueError(message)
def main(mode, save_path, steps, time_budget, reset):
    """Train a Markov chain generator with Pylearn2 or sample from one.

    Args:
        mode (str): ``"train"`` or ``"sample"``.
        save_path (str): Where the model is saved to / loaded from.
        steps (int): Number of steps to generate in sample mode.
        time_budget: Forwarded to the Pylearn2 main loop in train mode.
        reset (bool): In train mode, ignore an existing file at
            ``save_path`` and start from a fresh model.
    """
    num_states = ChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        readout = LinearReadout(
            readout_dim=num_states, source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(
                num_states, feedback_dim, name='feedback'),
            name="readout")
        generator = SequenceGenerator(
            readout, transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        # Override the pushed initialization for the transition only
        transition.weights_init = Orthogonal()
        generator.initialize()

        param_shapes = [
            (key, value.get_value().shape)
            for key, value in Selector(generator).get_params().items()]
        logger.info("Parameters:\n" + pprint.pformat(param_shapes,
                                                     width=120))
        logger.info("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        expected_min_error = -ChainDataset.entropy * seq_len * batch_size
        logger.info("Expected min error: {}".format(expected_min_error))

        # Resume from an existing file unless a reset was requested
        if reset or not os.path.isfile(save_path):
            model = Pylearn2Model(generator)
        else:
            model = Pylearn2Model.load(save_path)

        # Build the cost computation graph.
        # Note: would be probably nicer to make cost part of the model.
        x = tensor.ltensor3('x')
        total_cost = model.brick.cost(x[:, :, 0]).sum()
        cost = Pylearn2Cost(total_cost)

        dataset = ChainDataset(rng, seq_len)
        learning_rule = Pylearn2LearningRule(
            SGDLearningRule(),
            {'training_objective': cost.cost})
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=learning_rule)
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=save_path, save_freq=10)
        train.main_loop(time_budget=time_budget)
    elif mode == "sample":
        model = Pylearn2Model.load(save_path)
        generator = model.brick

        generated = generator.generate(
            n_steps=steps, batch_size=1, iterate=True)
        sample = ComputationGraph(generated).function()

        # Keep only the single batch element of every output
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        # Count observed transitions between consecutive outputs and
        # normalize every row.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for current, following in zip(outputs, outputs[1:]):
            trans_freqs[current, following] += 1
        row_totals = trans_freqs.sum(axis=1)[:, None]
        trans_freqs /= row_totals
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        assert False
def __init__(self, *args, **kwargs):
    """Wrap a `GatedRecurrent` built from the given arguments.

    All positional and keyword arguments are forwarded unchanged; the
    wrapped brick becomes the only child.
    """
    wrapped = GatedRecurrent(*args, **kwargs)
    self.gated_recurrent = wrapped
    self.children = [wrapped]
def main():
    """Command line entry point of the Markov chain case study.

    Parses ``mode`` (``train`` or ``sample``), an optional ``prefix``
    for the model, timing and state files, and ``--steps``. In train
    mode the generator is trained with the Groundhog main loop; in
    sample mode parameters are loaded from ``<prefix>model.npz`` and the
    frequencies of a generated sequence are printed next to the
    reference values of the chain.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # The text is the description of the program; passed positionally it
    # would be taken as the program name shown in the usage line.
    parser = argparse.ArgumentParser(
        description="Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    # nargs="?" is required for the default of a positional argument to
    # take effect; without it the prefix is mandatory.
    parser.add_argument(
        "prefix", nargs="?", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    args = parser.parse_args()

    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8

    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim)
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_states, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_states, feedback_dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" +
                 pprint.pformat(
                     [(key, value.get_value().shape) for key, value
                      in Selector(generator).get_params().items()],
                     width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        # Override the pushed initialization for the transition only
        transition.weights_init = Orthogonal()
        generator.initialize()
        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()
        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state, None)
        main_loop.main()
    elif args.mode == "sample":
        load_params(generator, args.prefix + "model.npz")

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()

        # Keep only the single batch element of every output
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        # Count observed transitions between consecutive outputs and
        # normalize every row.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        # Not reachable through argparse (`choices` rejects other
        # values); raised explicitly so it also fires under `python -O`.
        raise AssertionError("unknown mode: {!r}".format(args.mode))
def main(mode, save_path, steps, num_batches):
    """Train a Markov chain generator with the Blocks main loop or sample.

    Args:
        mode (str): ``"train"`` or ``"sample"``.
        save_path (str): Checkpoint path; written in train mode and
            unpickled in sample mode.
        steps (int): Number of steps to generate in sample mode.
        num_batches (int): Number of batches after which training stops.

    Raises:
        AssertionError: If ``mode`` is neither ``"train"`` nor
            ``"sample"``.
    """
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        # Override the pushed initialization for the transition only
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_params().items()],
                        width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(generator.cost_matrix(x[:, :]).sum(),
                                x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost, params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[FinishAfter(after_n_batches=num_batches),
                        TrainingDataMonitoring([cost], prefix="this_step",
                                               after_batch=True),
                        TrainingDataMonitoring([cost], prefix="average",
                                               every_n_batches=100),
                        Checkpoint(save_path, every_n_batches=500),
                        Printing(every_n_batches=100)])
        main_loop.run()
    elif mode == "sample":
        # NOTE: unpickling executes arbitrary code; only load checkpoints
        # from a trusted source. The file is closed as soon as it is read.
        with open(save_path, "rb") as source:
            main_loop = cPickle.load(source)
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        # Keep only the single batch element of every output
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(theano.config.floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               MarkovChainDataset.equilibrium))

        # Count observed transitions between consecutive outputs and
        # normalize every row.
        trans_freqs = numpy.zeros((num_states, num_states),
                                  dtype=theano.config.floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        # Raised explicitly (instead of `assert False`) so that an
        # invalid mode is also rejected under `python -O`.
        raise AssertionError("unknown mode: {!r}".format(mode))
# Builds the symbolic graph of a Linear -> GatedRecurrent -> Linear ->
# Sigmoid model, once over whole sequences (`predict`) and once as a
# single non-iterated step (`y_hat_testing`).

# Parameters
n_u = 225  # input vector size (not time at this point)
n_y = 225  # output vector size
n_h = 500  # number of hidden units
iteration = 300  # number of epochs of gradient descent

# Parenthesized form prints the same text under Python 2 and is valid
# syntax under Python 3.
print("Building Model")
# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim=n_u, output_dim=n_h, name="first_layer")
rnn = GatedRecurrent(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = rnn.apply(x_transform)
predict = sigm.apply(linear2.apply(h))

# only for generation B x h_dim
# NOTE(review): the comment above describes a 2-d state while
# `h_initial` is declared as a 3-d tensor and is passed as the second
# positional argument of `rnn.apply` -- confirm against the
# GatedRecurrent signature in use.
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = rnn.apply(x_transform, h_initial, iterate=False)
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'
class Parrot(Initializable, Random): def __init__( self, input_dim=420, # Dimension of the text labels output_dim=63, # Dimension of vocoder fram rnn_h_dim=1024, # Size of rnn hidden state readouts_dim=1024, # Size of readouts (summary of rnn) weak_feedback=False, # Feedback to the top rnn layer full_feedback=False, # Feedback to all rnn layers feedback_noise_level=None, # Amount of noise in feedback layer_norm=False, # Use simple normalization? use_speaker=False, # Condition on the speaker id? num_speakers=21, # How many speakers there are? speaker_dim=128, # Size of speaker embedding which_cost='MSE', # Train with MSE or GMM k_gmm=20, # How many components in the GMM sampling_bias=0, # Make samples more likely (Graves13) epsilon=1e-5, # Numerical stabilities num_characters=43, # how many chars in the labels attention_type='graves', # graves or softmax attention_size=10, # number of gaussians in the attention attention_alignment=1., # audio steps per letter at initialization sharpening_coeff=1., timing_coeff=1., encoder_type=None, encoder_dim=128, raw_output=False, **kwargs): super(Parrot, self).__init__(**kwargs) self.input_dim = input_dim self.output_dim = output_dim self.rnn_h_dim = rnn_h_dim self.readouts_dim = readouts_dim self.layer_norm = layer_norm self.which_cost = which_cost self.use_speaker = use_speaker self.full_feedback = full_feedback self.feedback_noise_level = feedback_noise_level self.epsilon = epsilon self.num_characters = num_characters self.attention_type = attention_type self.attention_alignment = attention_alignment self.attention_size = attention_size self.sharpening_coeff = sharpening_coeff self.timing_coeff = timing_coeff self.encoder_type = encoder_type self.encoder_dim = encoder_dim self.encoded_input_dim = input_dim self.raw_output = raw_output if self.encoder_type == 'bidirectional': self.encoded_input_dim = 2 * encoder_dim if self.feedback_noise_level is not None: self.noise_level_var = tensor.scalar('feedback_noise_level') self.rnn1 
= GatedRecurrent(dim=rnn_h_dim, name='rnn1') self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2') self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3') self.h1_to_readout = Linear( input_dim=rnn_h_dim, output_dim=readouts_dim, name='h1_to_readout') self.h2_to_readout = Linear( input_dim=rnn_h_dim, output_dim=readouts_dim, name='h2_to_readout') self.h3_to_readout = Linear( input_dim=rnn_h_dim, output_dim=readouts_dim, name='h3_to_readout') self.h1_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h2') self.h1_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h1_to_h3') self.h2_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=rnn_h_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='h2_to_h3') if which_cost == 'MSE': self.readout_to_output = Linear( input_dim=readouts_dim, output_dim=output_dim, name='readout_to_output') elif which_cost == 'GMM': self.sampling_bias = sampling_bias self.k_gmm = k_gmm self.readout_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=readouts_dim, output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm], name='readout_to_output') self.encoder = Encoder( encoder_type, num_characters, input_dim, encoder_dim, name='encoder') self.children = [ self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout, self.h2_to_readout, self.h3_to_readout, self.h1_to_h2, self.h1_to_h3, self.h2_to_h3, self.readout_to_output] self.inp_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h1') self.inp_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=self.encoded_input_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h2') self.inp_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=self.encoded_input_dim, 
output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='inp_to_h3') self.children += [ self.inp_to_h1, self.inp_to_h2, self.inp_to_h3] self.h1_to_att = Fork( output_names=['alpha', 'beta', 'kappa'], input_dim=rnn_h_dim, output_dims=[attention_size] * 3, name='h1_to_att') self.att_to_readout = Linear( input_dim=self.encoded_input_dim, output_dim=readouts_dim, name='att_to_readout') self.children += [ self.h1_to_att, self.att_to_readout] if use_speaker: self.num_speakers = num_speakers self.speaker_dim = speaker_dim self.embed_speaker = LookupTable(num_speakers, speaker_dim) self.speaker_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h1') self.speaker_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h2') self.speaker_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=speaker_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='speaker_to_h3') self.speaker_to_readout = Linear( input_dim=speaker_dim, output_dim=readouts_dim, name='speaker_to_readout') if which_cost == 'MSE': self.speaker_to_output = Linear( input_dim=speaker_dim, output_dim=output_dim, name='speaker_to_output') elif which_cost == 'GMM': self.speaker_to_output = Fork( output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'], input_dim=speaker_dim, output_dims=[ output_dim * k_gmm, output_dim * k_gmm, k_gmm], name='speaker_to_output') self.children += [ self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2, self.speaker_to_h3, self.speaker_to_readout, self.speaker_to_output] if full_feedback: self.out_to_h2 = Fork( output_names=['rnn2_inputs', 'rnn2_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h2') self.out_to_h3 = Fork( output_names=['rnn3_inputs', 'rnn3_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h3') self.children += [ self.out_to_h2, 
self.out_to_h3] weak_feedback = True self.weak_feedback = weak_feedback if weak_feedback: self.out_to_h1 = Fork( output_names=['rnn1_inputs', 'rnn1_gates'], input_dim=output_dim, output_dims=[rnn_h_dim, 2 * rnn_h_dim], name='out_to_h1') self.children += [ self.out_to_h1] if self.raw_output: self.sampleRnn = SampleRnn() self.children += [self.sampleRnn] def _allocate(self): self.initial_w = shared_floatx_zeros( (self.encoded_input_dim,), name="initial_w") add_role(self.initial_w, INITIAL_STATE) def symbolic_input_variables(self): features = tensor.tensor3('features') features_mask = tensor.matrix('features_mask') labels = tensor.imatrix('labels') labels_mask = tensor.matrix('labels_mask') start_flag = tensor.scalar('start_flag') if self.use_speaker: speaker = tensor.imatrix('speaker_index') else: speaker = None if self.raw_output: raw_sequence = tensor.itensor3('raw_audio') else: raw_sequence = None return features, features_mask, labels, labels_mask, \ speaker, start_flag, raw_sequence def initial_states(self, batch_size): initial_h1 = self.rnn1.initial_states(batch_size) initial_h2 = self.rnn2.initial_states(batch_size) initial_h3 = self.rnn3.initial_states(batch_size) last_h1 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h2 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) last_h3 = shared_floatx_zeros((batch_size, self.rnn_h_dim)) # Defining for all initial_k = tensor.zeros( (batch_size, self.attention_size), dtype=floatX) last_k = shared_floatx_zeros((batch_size, self.attention_size)) # Trainable initial state for w. Why not for k? 
initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0) last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim)) return initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k @application def compute_cost( self, features, features_mask, labels, labels_mask, speaker, start_flag, batch_size, raw_audio=None): if speaker is None: assert not self.use_speaker target_features = features[1:] mask = features_mask[1:] cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim) gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.weak_feedback: input_features = features[:-1] if self.feedback_noise_level: noise = self.theano_rng.normal( size=input_features.shape, avg=0., std=1.) 
input_features += self.noise_level_var * noise out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features) to_normalize = [ out_cell_h1, out_gat_h1] out_cell_h1, out_gat_h1 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += out_cell_h1 gat_h1 += out_gat_h1 if self.full_feedback: assert self.weak_feedback out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features) out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features) to_normalize = [ out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3] out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2 += out_cell_h2 gat_h2 += out_gat_h2 cell_h3 += out_cell_h3 gat_h3 += out_gat_h3 if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 = spk_cell_h1 + cell_h1 cell_h2 = spk_cell_h2 + cell_h2 cell_h3 = spk_cell_h3 + cell_h3 gat_h1 = spk_gat_h1 + gat_h1 gat_h2 = spk_gat_h2 + gat_h2 gat_h3 = spk_gat_h3 + gat_h3 initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(batch_size) # If it's a new example, use initial states. 
input_h1 = tensor.switch( start_flag, initial_h1, last_h1) input_h2 = tensor.switch( start_flag, initial_h2, last_h2) input_h3 = tensor.switch( start_flag, initial_h3, last_h3) input_w = tensor.switch( start_flag, initial_w, last_w) input_k = tensor.switch( start_flag, initial_k, last_k) context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft( tensor.arange(labels.shape[1], dtype=floatX), 2) def step( inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh): attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) inp_h1_t += attinp_h1 gat_h1_t += attgat_h1 h1_t = self.rnn1.apply( inp_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == "softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) + self.epsilon k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t) a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum( a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) inp_h2_t += attinp_h2 gat_h2_t += attgat_h2 inp_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [ h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply( inp_h2_t + h1inp_h2, 
gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [ h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply( inp_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_ (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan( fn=step, sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[context_oh], outputs_info=[ input_h1, input_h2, input_h3, input_k, input_w, None, None]) h1_out = self.h1_to_readout.apply(h1) h2_out = self.h2_to_readout.apply(h2) h3_out = self.h3_to_readout.apply(h3) to_normalize = [ h1_out, h2_out, h3_out] h1_out, h2_out, h3_out = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readouts = h1_out + h2_out + h3_out if self.use_speaker: readouts += self.speaker_to_readout.apply(emb_speaker) readouts += self.att_to_readout.apply(w) predicted = self.readout_to_output.apply(readouts) if self.which_cost == 'MSE': if self.use_speaker: predicted += self.speaker_to_output.apply(emb_speaker) cost = tensor.sum((predicted - target_features) ** 2, axis=-1) next_x = predicted # Dummy value for coeff coeff = predicted elif self.which_cost == 'GMM': mu, sigma, coeff = predicted if self.use_speaker: spk_to_out = self.speaker_to_output.apply(emb_speaker) mu += spk_to_out[0] sigma += spk_to_out[1] coeff += spk_to_out[2] # When training there should not be sampling_bias sigma = tensor.exp(sigma) + self.epsilon coeff = tensor.nnet.softmax( coeff.reshape( (-1, self.k_gmm))).reshape( coeff.shape) + self.epsilon cost = cost_gmm(target_features, mu, sigma, coeff) next_x = sample_gmm(mu, sigma, coeff, self.theano_rng) cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. 
* start_flag updates = [] updates.append((last_h1, h1[-1])) updates.append((last_h2, h2[-1])) updates.append((last_h3, h3[-1])) updates.append((last_k, k[-1])) updates.append((last_w, w[-1])) cost_raw = None if self.raw_output: raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0) raw_mask = raw_mask.dimshuffle(1, 0) # breakpointOp = PdbBreakpoint("Raw mask breakpoint") # condition = tensor.gt(raw_mask.shape[0], 0) # raw_mask = breakpointOp(condition, raw_mask) predicted_transposed = predicted.dimshuffle(1, 0, 2) last_h0, last_big_h0 = self.sampleRnn.initial_states(batch_size) raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2) raw_audio_reshaped = raw_audio_reshaped.reshape((raw_audio_reshaped.shape[0], -1)) cost_raw, ip_cost, all_params, ip_params, other_params, new_h0, new_big_h0 =\ self.sampleRnn.apply(raw_audio_reshaped, predicted_transposed, last_h0, last_big_h0, start_flag, raw_mask) if self.sampleRnn.N_RNN == 1: new_h0 = tensor.unbroadcast(new_h0, 1) new_big_h0 = tensor.unbroadcast(new_big_h0, 1) updates.append((last_h0, new_h0)) updates.append((last_big_h0, new_big_h0)) # cost = cost + 80.*cost_raw alpha_ = numpy.float32(0.) beta_ = numpy.float32(1.) 
cost = alpha_*cost + beta_*cost_raw attention_vars = [next_x, k, w, coeff, phi, pi_att] return cost, scan_updates + updates, attention_vars, cost_raw @application def sample_model_fun( self, labels, labels_mask, speaker, num_samples, seq_size): initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \ initial_w, last_w, initial_k, last_k = \ self.initial_states(num_samples) initial_x = numpy.zeros( (num_samples, self.output_dim), dtype=floatX) cell_shape = (seq_size, num_samples, self.rnn_h_dim) gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim) cell_h1 = tensor.zeros(cell_shape, dtype=floatX) cell_h2 = tensor.zeros(cell_shape, dtype=floatX) cell_h3 = tensor.zeros(cell_shape, dtype=floatX) gat_h1 = tensor.zeros(gat_shape, dtype=floatX) gat_h2 = tensor.zeros(gat_shape, dtype=floatX) gat_h3 = tensor.zeros(gat_shape, dtype=floatX) if self.use_speaker: speaker = speaker[:, 0] emb_speaker = self.embed_speaker.apply(speaker) # Applied before the broadcast. spk_readout = self.speaker_to_readout.apply(emb_speaker) spk_output = self.speaker_to_output.apply(emb_speaker) # Add dimension to repeat with time. 
emb_speaker = tensor.shape_padleft(emb_speaker) spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker) spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker) spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker) to_normalize = [ spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3, spk_gat_h3] spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \ spk_cell_h3, spk_gat_h3, = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1 += spk_cell_h1 cell_h2 += spk_cell_h2 cell_h3 += spk_cell_h3 gat_h1 += spk_gat_h1 gat_h2 += spk_gat_h2 gat_h3 += spk_gat_h3 context_oh = self.encoder.apply(labels) * \ tensor.shape_padright(labels_mask) u = tensor.shape_padleft( tensor.arange(labels.shape[1], dtype=floatX), 2) def sample_step( inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1): cell_h1_t = inp_cell_h1_t cell_h2_t = inp_cell_h2_t cell_h3_t = inp_cell_h3_t gat_h1_t = inp_gat_h1_t gat_h2_t = inp_gat_h2_t gat_h3_t = inp_gat_h3_t attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1) cell_h1_t += attinp_h1 gat_h1_t += attgat_h1 if self.weak_feedback: out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1) to_normalize = [ out_cell_h1_t, out_gat_h1_t] out_cell_h1_t, out_gat_h1_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h1_t += out_cell_h1_t gat_h1_t += out_gat_h1_t if self.full_feedback: out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1) out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1) to_normalize = [ out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t] out_cell_h2_t, out_gat_h2_t, \ out_cell_h3_t, out_gat_h3_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] cell_h2_t += out_cell_h2_t cell_h3_t += out_cell_h3_t gat_h2_t += out_gat_h2_t gat_h3_t += out_gat_h3_t h1_t = self.rnn1.apply( cell_h1_t, gat_h1_t, h1_tm1, iterate=False) a_t, b_t, k_t = self.h1_to_att.apply(h1_t) if self.attention_type == 
"softmax": a_t = tensor.nnet.softmax(a_t) + self.epsilon else: a_t = tensor.exp(a_t) + self.epsilon b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon k_t = k_tm1 + self.attention_alignment * \ tensor.exp(k_t) / self.timing_coeff a_t_ = a_t a_t = tensor.shape_padright(a_t) b_t = tensor.shape_padright(b_t) k_t_ = tensor.shape_padright(k_t) # batch size X att size X len context if self.attention_type == "softmax": # numpy.sqrt(1/(2*numpy.pi)) is the weird number phi_t = 0.3989422917366028 * tensor.sum( a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1) else: phi_t = tensor.sum( a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1) # batch size X len context X num letters w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1) attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t) attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t) cell_h2_t += attinp_h2 gat_h2_t += attgat_h2 cell_h3_t += attinp_h3 gat_h3_t += attgat_h3 h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t) h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t) to_normalize = [ h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3] h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h2_t = self.rnn2.apply( cell_h2_t + h1inp_h2, gat_h2_t + h1gat_h2, h2_tm1, iterate=False) h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t) to_normalize = [ h2inp_h3, h2gat_h3] h2inp_h3, h2gat_h3 = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] h3_t = self.rnn3.apply( cell_h3_t + h1inp_h3 + h2inp_h3, gat_h3_t + h1gat_h3 + h2gat_h3, h3_tm1, iterate=False) h1_out_t = self.h1_to_readout.apply(h1_t) h2_out_t = self.h2_to_readout.apply(h2_t) h3_out_t = self.h3_to_readout.apply(h3_t) to_normalize = [ h1_out_t, h2_out_t, h3_out_t] h1_out_t, h2_out_t, h3_out_t = \ [_apply_norm(x, self.layer_norm) for x in to_normalize] readout_t = h1_out_t + h2_out_t + h3_out_t readout_t += self.att_to_readout.apply(w_t) if self.use_speaker: readout_t += spk_readout output_t = 
self.readout_to_output.apply(readout_t) if self.which_cost == 'MSE': predicted_x_t = output_t if self.use_speaker: predicted_x_t += spk_output # Dummy value for coeff_t coeff_t = predicted_x_t elif self.which_cost == "GMM": mu_t, sigma_t, coeff_t = output_t if self.use_speaker: mu_t += spk_output[0] sigma_t += spk_output[1] coeff_t += spk_output[2] sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \ self.epsilon coeff_t = tensor.nnet.softmax( coeff_t.reshape( (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape( coeff_t.shape) + self.epsilon predicted_x_t = sample_gmm( mu_t, sigma_t, coeff_t, self.theano_rng) return predicted_x_t, h1_t, h2_t, h3_t, \ k_t, w_t, coeff_t, phi_t, a_t_ (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan( fn=sample_step, sequences=[ cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3], non_sequences=[], outputs_info=[ initial_x, initial_h1, initial_h2, initial_h3, initial_k, initial_w, None, None, None]) return sample_x, k, w, pi, phi, pi_att, updates def sample_model( self, labels_tr, labels_mask_tr, features_mask_tr, speaker_tr, num_samples, num_steps): features, features_mask, labels, labels_mask, speaker, start_flag, raw_sequence = \ self.symbolic_input_variables() sample_x, k, w, pi, phi, pi_att, updates = \ self.sample_model_fun( labels, labels_mask, speaker, num_samples, num_steps) theano_inputs = [labels, labels_mask] numpy_inputs = (labels_tr, labels_mask_tr) if self.use_speaker: theano_inputs += [speaker] numpy_inputs += (speaker_tr,) return function( theano_inputs, [sample_x, k, w, pi, phi, pi_att], updates=updates)(*numpy_inputs) def sample_using_input(self, data_tr, num_samples): # Used to predict the values using the dataset features, features_mask, labels, labels_mask, speaker, start_flag, raw_sequence = \ self.symbolic_input_variables() cost, updates, attention_vars = self.compute_cost( features, features_mask, labels, labels_mask, speaker, start_flag, num_samples) sample_x, k, w, pi, phi, pi_att = 
attention_vars theano_vars = [ features, features_mask, labels, labels_mask, speaker, start_flag] theano_vars = [x for x in theano_vars if x is not None] theano_vars = list(set(theano_vars)) theano_vars = {x.name: x for x in theano_vars} theano_inputs = [] numpy_inputs = [] for key in data_tr.keys(): theano_inputs.append(theano_vars[key]) numpy_inputs.append(data_tr[key]) return function( theano_inputs, [sample_x, k, w, pi, phi, pi_att], updates=updates)(*numpy_inputs)
args = getArguments()

# Read the corpus through a context manager so the file handle is closed
# deterministically instead of being left to the garbage collector.
with open(args.corpus) as corpus_file:
    corpus = Corpus(corpus_file.read())

train_data, vocab_size = createDataset(
    corpus=corpus,
    sequence_length=750,
    repeat=20
)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=["states"],  # transition.apply.states ???
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size,
                    feedback_dim,
                    name='feedback'
                ),
                name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator"
    )
def main():
    """Train or sample an RNN that models a Markov chain.

    Command line: ``mode`` (``train`` or ``sample``), an optional
    ``save_path`` (defaults to ``sine``), ``--steps`` and ``--reset``.
    In ``train`` mode a GRU sequence generator is built (or reloaded from
    ``save_path``) and trained with Pylearn2's SGD. In ``sample`` mode a
    saved model generates ``--steps`` symbols and empirical frequencies
    are printed next to the chain's true statistics.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    # The first positional parameter of ArgumentParser is ``prog``, so the
    # description must be passed by keyword to appear as a description.
    parser = argparse.ArgumentParser(
        description="Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    # nargs="?" makes the positional optional; without it the default
    # could never be used because argparse would require the argument.
    parser.add_argument(
        "save_path", nargs="?", default="sine",
        help="The path to save PyLearn2 model")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    parser.add_argument(
        "--reset", action="store_true", default=False,
        help="Start training from scratch")
    args = parser.parse_args()

    num_states = ChainDataset.num_states

    if args.mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", activation=Tanh(),
                                    dim=dim)
        generator = SequenceGenerator(
            LinearReadout(readout_dim=num_states, source_names=["states"],
                          emitter=SoftmaxEmitter(name="emitter"),
                          feedbacker=LookupFeedback(
                              num_states, feedback_dim, name='feedback'),
                          name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        # Override the pushed Gaussian init for the recurrent transition.
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.debug("Parameters:\n" +
                     pprint.pformat(
                         [(key, value.get_value().shape) for key, value
                          in Selector(generator).get_params().items()],
                         width=120))
        logger.debug("Markov chain entropy: {}".format(
            ChainDataset.entropy))
        logger.debug("Expected min error: {}".format(
            -ChainDataset.entropy * seq_len * batch_size))

        # Resume from an existing model unless --reset was given.
        if os.path.isfile(args.save_path) and not args.reset:
            model = Pylearn2Model.load(args.save_path)
        else:
            model = Pylearn2Model(generator)

        # Build the cost computation graph.
        # Note: would be probably nicer to make cost part of the model.
        x = tensor.ltensor3('x')
        cost = Pylearn2Cost(model.brick.cost(x[:, :, 0]).sum())

        dataset = ChainDataset(rng, seq_len)
        sgd = SGD(learning_rate=0.0001, cost=cost,
                  batch_size=batch_size, batches_per_iter=10,
                  monitoring_dataset=dataset,
                  monitoring_batch_size=batch_size,
                  monitoring_batches=1,
                  learning_rule=Pylearn2LearningRule(
                      SGDLearningRule(),
                      dict(training_objective=cost.cost)))
        train = Pylearn2Train(dataset, model, algorithm=sgd,
                              save_path=args.save_path, save_freq=10)
        train.main_loop()
    elif args.mode == "sample":
        model = Pylearn2Model.load(args.save_path)
        generator = model.brick

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()

        # batch_size is 1, so keep only the first column of each output.
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainDataset.equilibrium))

        # Count observed a -> b transitions and normalise each row.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainDataset.trans_prob))
    else:
        # Unreachable: argparse restricts ``mode`` to the two choices above.
        assert False
def main(mode, save_path, steps, num_batches, load_params):
    """Character-level language model on WSJ text: train, sample, evaluate.

    ``mode`` selects the action:

    * ``'continue'`` - resume training from ``save_path`` and return;
    * ``'sample'``   - load a pickled main loop from ``save_path``, generate
      ``steps`` characters and print them with the generation cost;
    * ``'train'``    - build the model and run the training main loop;
    * ``'evaluate'`` - estimate a word-level cost on the validation stream
      using the lexicon file.

    ``load_params`` makes the freshly built model load parameter values
    from ``save_path``. ``num_batches`` is accepted but not used.
    """
    # NOTE(review): the digits are the integers 0-9, not the strings
    # '0'-'9', so digit characters in the text presumably never match a
    # vocabulary key - confirm before changing; it would alter the data
    # encoding used by existing models.
    chars = (list(string.ascii_uppercase) + list(range(10)) +
             [' ', '.', ',', '\'', '"', '!', '?', '<UNK>'])
    char_to_ind = {char: i for i, char in enumerate(chars)}
    # ``items`` works on both Python 2 and 3 (``iteritems`` is 2-only).
    ind_to_char = {v: k for k, v in char_to_ind.items()}

    train_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_train'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')
    valid_dataset = TextFile(['/Tmp/serdyuk/data/wsj_text_valid'],
                             char_to_ind, bos_token=None, eos_token=None,
                             level='character')

    vocab_size = len(char_to_ind)
    logger.info('Dictionary size: {}'.format(vocab_size))
    if mode == 'continue':
        continue_training(save_path)
        return
    elif mode == "sample":
        # Close the checkpoint file as soon as it has been unpickled.
        with open(save_path, "rb") as checkpoint:
            main_loop = load(checkpoint)
        generator = main_loop.model.get_top_bricks()[-1]

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        # batch_size is 1, so keep only the first column of each output.
        states, outputs, costs = [data[:, 0] for data in sample()]

        # str() because the digit symbols in the vocabulary are ints and
        # str.join rejects non-string items.
        print("".join(str(ind_to_char[s]) for s in outputs))

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))
        return

    # Experiment configuration
    batch_size = 20
    dim = 650
    feedback_dim = 650

    valid_stream = valid_dataset.get_example_stream()
    valid_stream = Batch(valid_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    valid_stream = Padding(valid_stream)
    valid_stream = Mapping(valid_stream, _transpose)

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size, source_names=transition.apply.states,
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(
                    vocab_size, feedback_dim, name='feedback'),
                name="readout"),
        transition,
        weights_init=Uniform(std=0.04), biases_init=Constant(0),
        name="generator")
    generator.push_initialization_config()
    # Override the pushed uniform init for the recurrent transition.
    transition.weights_init = Orthogonal()
    transition.push_initialization_config()
    generator.initialize()

    # Build the cost computation graph.
    features = tensor.lmatrix('features')
    features_mask = tensor.matrix('features_mask')
    cost_matrix = generator.cost_matrix(
        features, mask=features_mask)
    batch_cost = cost_matrix.sum()
    cost = aggregation.mean(
        batch_cost,
        features.shape[1])
    cost.name = "sequence_log_likelihood"
    char_cost = aggregation.mean(
        batch_cost, features_mask.sum())
    char_cost.name = 'character_log_likelihood'
    ppl = 2 ** (cost / numpy.log(2))
    ppl.name = 'ppl'
    bits_per_char = char_cost / tensor.log(2)
    bits_per_char.name = 'bits_per_char'
    length = features.shape[0]
    length.name = 'length'

    model = Model(batch_cost)
    if load_params:
        params = load_parameter_values(save_path)
        model.set_parameter_values(params)

    if mode == "train":
        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_parameters().items()],
                        width=120))

        train_stream = train_dataset.get_example_stream()
        train_stream = Mapping(train_stream, _truncate)
        train_stream = Batch(train_stream,
                             iteration_scheme=ConstantScheme(batch_size))
        train_stream = Padding(train_stream)
        train_stream = Mapping(train_stream, _transpose)

        parameters = model.get_parameter_dict()
        # Only referenced by the disabled Restrict(...) rule below.
        maxnorm_subjects = VariableFilter(roles=[WEIGHT])(parameters.values())
        algorithm = GradientDescent(
            cost=batch_cost,
            parameters=parameters.values(),
            step_rule=CompositeRule([StepClipping(1000.),
                                     AdaDelta(epsilon=1e-8)
                                     # , Restrict(VariableClipping(1.0, axis=0), maxnorm_subjects)
                                     ]))
        ft = features[:6, 0]
        ft.name = 'feature_example'

        observables = [cost, ppl, char_cost, length, bits_per_char]
        # Per-parameter RMS of the value, the gradient and the step, plus
        # the step/gradient ratio.
        for name, param in parameters.items():
            # numpy.prod: the ``numpy.product`` alias was removed in NumPy 2.
            num_elements = numpy.prod(param.get_value().shape)
            norm = param.norm(2) / num_elements ** 0.5
            grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5
            step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5
            stats = tensor.stack(norm, grad_norm, step_norm,
                                 step_norm / grad_norm)
            stats.name = name + '_stats'
            observables.append(stats)
        track_the_best_bpc = TrackTheBest('valid_bits_per_char')
        root_path, extension = os.path.splitext(save_path)

        this_step_monitoring = TrainingDataMonitoring(
            observables + [ft], prefix="this_step", after_batch=True)
        average_monitoring = TrainingDataMonitoring(
            observables + [algorithm.total_step_norm,
                           algorithm.total_gradient_norm],
            prefix="average",
            every_n_batches=10)
        valid_monitoring = DataStreamMonitoring(
            observables, prefix="valid",
            every_n_batches=1500, before_training=False,
            data_stream=valid_stream)
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=train_stream,
            model=model,
            extensions=[
                this_step_monitoring,
                average_monitoring,
                valid_monitoring,
                track_the_best_bpc,
                Checkpoint(save_path, ),
                # Second checkpoint also writes a "<root>_best<ext>" copy
                # whenever the best validation bits-per-char improves.
                Checkpoint(save_path,
                           every_n_batches=500,
                           save_separately=["model", "log"],
                           use_cpickle=True)
                .add_condition(
                    ['after_epoch'],
                    OnLogRecord(track_the_best_bpc.notification_name),
                    (root_path + "_best" + extension,)),
                Timing(after_batch=True),
                Printing(every_n_batches=10),
                Plot(root_path,
                     [[average_monitoring.record_name(cost),
                       valid_monitoring.record_name(cost)],
                      [average_monitoring.record_name(algorithm.total_step_norm)],
                      [average_monitoring.record_name(algorithm.total_gradient_norm)],
                      [average_monitoring.record_name(ppl),
                       valid_monitoring.record_name(ppl)],
                      [average_monitoring.record_name(char_cost),
                       valid_monitoring.record_name(char_cost)],
                      [average_monitoring.record_name(bits_per_char),
                       valid_monitoring.record_name(bits_per_char)]],
                     every_n_batches=10)
            ])
        main_loop.run()

    elif mode == 'evaluate':
        with open('/data/lisatmp3/serdyuk/wsj_lms/lms/wsj_trigram_with_initial_eos/lexicon.txt') as f:
            # Drop the first and the last whitespace-separated field of
            # every lexicon line; the rest are the word's symbols.
            raw_words = [line.split()[1:-1] for line in f.readlines()]
            words = [[char_to_ind[c] if c in char_to_ind
                      else char_to_ind['<UNK>'] for c in w]
                     for w in raw_words]
        max_word_length = max([len(w) for w in words])

        initial_states = tensor.matrix('init_states')
        cost_matrix_step = generator.cost_matrix(features,
                                                 mask=features_mask,
                                                 states=initial_states)
        cg = ComputationGraph(cost_matrix_step)
        states = cg.auxiliary_variables[-2]
        compute_cost = theano.function(
            [features, features_mask, initial_states],
            [cost_matrix_step.sum(axis=0), states])

        # NOTE(review): the three names below are not used afterwards.
        cost_matrix = generator.cost_matrix(features, mask=features_mask)
        initial_cg = ComputationGraph(cost_matrix)
        initial_states = initial_cg.auxiliary_variables[-2]

        total_word_cost = 0
        num_words = 0
        # One column per lexicon word, zero-padded to a common length.
        examples = numpy.zeros((max_word_length + 1, len(words)),
                               dtype='int64')
        all_masks = numpy.zeros((max_word_length + 1, len(words)),
                                dtype=floatX)

        for i, word in enumerate(words):
            examples[:len(word), i] = word
            all_masks[:len(word), i] = 1.

        single_space = numpy.array([char_to_ind[' ']])[:, None]

        # NOTE(review): loop nesting below was reconstructed from a
        # whitespace-collapsed source - confirm against the original file.
        for batch in valid_stream.get_epoch_iterator():
            for example, mask in equizip(batch[0].T, batch[1].T):
                # The mask is floating point; a slice bound must be an int.
                example = example[:int(mask.sum())]
                spc_inds = list(numpy.where(example == char_to_ind[" "])[0])
                state = generator.transition.transition.initial_states_.get_value()[None, :]
                # Consecutive space positions delimit the words.
                for i, j in equizip([-1] + spc_inds, spc_inds + [-1]):
                    word = example[(i+1):j, None]
                    word_cost, states = compute_cost(
                        word, numpy.ones_like(word, dtype=floatX), state)
                    state = states[-1]

                    # Probability of every lexicon word from this state.
                    costs = numpy.exp(-compute_cost(
                        examples, all_masks,
                        numpy.tile(state, [examples.shape[1], 1]))[0])

                    # Advance the state over the separating space.
                    _, space_states = compute_cost(
                        single_space,
                        numpy.ones_like(single_space, dtype=floatX), state)
                    state = space_states[-1]

                    word_prob = numpy.exp(-word_cost)
                    total_word_cost += word_cost + numpy.log(numpy.sum(costs))
                    num_words += 1
                    print(word_prob)
                    print(numpy.sum(costs))
                    print("Average cost", total_word_cost / num_words)
                    print("PPL", numpy.exp(total_word_cost / num_words))

        print("Word-level perplexity")
        print(total_word_cost / num_words)
    else:
        assert False
# --- Feedback network: frame -> recurrent input ---------------------------
# depth_x rectifier layers; the last layer is as wide as the recurrent state.
activations_x = [Rectifier()] * depth_x
hidden_dims_x = [hidden_size_mlp_x] * (depth_x - 1)
dims_x = [frame_size] + hidden_dims_x + [hidden_size_recurrent]

mlp_x = MLP(activations=activations_x, dims=dims_x)
feedback = DeepTransitionFeedback(mlp=mlp_x)

# --- Recurrent core: stack of GRUs with skip connections -------------------
transition = []
for i in range(depth_recurrent):
    transition.append(
        GatedRecurrent(dim=hidden_size_recurrent, name="gru_%d" % i))
transition = RecurrentStack(
    transition, name="transition", skip_connections=True)

# --- Emitter: recurrent state -> GMM parameters ----------------------------
# depth_theta rectifier layers on top of the recurrent state.
activations_theta = [Rectifier()] * depth_theta
dims_theta = [hidden_size_recurrent] + [hidden_size_mlp_theta] * depth_theta

mlp_theta = MLP(activations=activations_theta, dims=dims_theta)
mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001)
emitter = GMMEmitter(
    gmmmlp=mlp_gmm, output_size=frame_size, k=k, name="emitter")
class TestGatedRecurrent(unittest.TestCase):
    """Check GatedRecurrent against a plain NumPy re-implementation."""

    def setUp(self):
        # Fully gated brick with constant weights: expectations are exact.
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2),
            activation=Tanh(), gate_activation=Tanh())
        self.gated.initialize()

        # Reset-gate-only brick with seeded random weights.
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(),
            activation=Tanh(), gate_activation=Tanh(),
            use_update_gate=False, rng=numpy.random.RandomState(1))
        self.reset_only.initialize()

    def test_one_step(self):
        """A single non-iterated step matches the hand-computed update."""
        h0, x, z, r = [tensor.matrix(label) for label in ('h0', 'x', 'z', 'r')]
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype=floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val

        # Every recurrent matrix was initialised to the constant 2.
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)
        recurrent = h0_val.dot(W_val)
        z_val = numpy.tanh(recurrent + zi_val)
        r_val = numpy.tanh(recurrent + ri_val)
        candidate = numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
        h1_val = z_val * candidate + (1 - z_val) * h0_val

        actual = next_h(h0_val, x_val, zi_val, ri_val)[0]
        assert_allclose(h1_val, actual, rtol=1e-6)

    def test_reset_only_many_steps(self):
        """A masked 24-step run matches a step-by-step NumPy loop."""
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        n_steps, n_batch, n_dim = 24, 4, 3

        # The 24 permutations of range(4) give a (24, 4) table that is
        # broadcast along a trailing feature axis of size 3.
        perms = numpy.asarray(list(itertools.permutations(range(4))),
                              dtype=floatX)
        x_val = 0.1 * perms
        x_val = numpy.ones((n_steps, n_batch, n_dim),
                           dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val

        # Batch column 3 is masked out for the second half of the steps.
        mask_val = numpy.ones((n_steps, n_batch), dtype=floatX)
        mask_val[12:24, 3] = 0

        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        # Row 0 holds the all-zero initial state.
        h_val = numpy.zeros((n_steps + 1, n_batch, n_dim), dtype=floatX)
        for t in range(n_steps):
            previous = h_val[t]
            keep = mask_val[t, :, None]
            r_val = numpy.tanh(previous.dot(U) + ri_val[t])
            h_val[t + 1] = numpy.tanh((r_val * previous).dot(W) + x_val[t])
            # Masked-out entries keep their previous state.
            h_val[t + 1] = keep * h_val[t + 1] + (1 - keep) * previous
        h_val = h_val[1:]

        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val, mask_val)[0],
                        rtol=1e-03)
def __init__(
        self,
        input_dim=420,  # Dimension of the text labels
        output_dim=63,  # Dimension of vocoder frame
        rnn_h_dim=1024,  # Size of rnn hidden state
        readouts_dim=1024,  # Size of readouts (summary of rnn)
        weak_feedback=False,  # Feedback to the top rnn layer
        full_feedback=False,  # Feedback to all rnn layers
        feedback_noise_level=None,  # Amount of noise in feedback
        layer_norm=False,  # Use simple normalization?
        use_speaker=False,  # Condition on the speaker id?
        num_speakers=21,  # How many speakers there are?
        speaker_dim=128,  # Size of speaker embedding
        which_cost='MSE',  # Train with MSE or GMM
        k_gmm=20,  # How many components in the GMM
        sampling_bias=0,  # Make samples more likely (Graves13)
        epsilon=1e-5,  # Numerical stabilities
        num_characters=43,  # how many chars in the labels
        attention_type='graves',  # graves or softmax
        attention_size=10,  # number of gaussians in the attention
        attention_alignment=1.,  # audio steps per letter at initialization
        sharpening_coeff=1.,
        timing_coeff=1.,
        encoder_type=None,
        encoder_dim=128,
        raw_output=False,
        **kwargs):
    """Create the child bricks of the three-layer attention GRU model.

    Builds three ``GatedRecurrent`` layers, the ``Fork``/``Linear`` bricks
    connecting label context, speaker embedding, previous outputs and
    lower layers to each recurrent layer, the attention and readout
    projections, and the output layer, registering all of them in
    ``self.children``.

    ``full_feedback=True`` also enables ``weak_feedback``.

    Raises:
        ValueError: if ``which_cost`` is neither ``'MSE'`` nor ``'GMM'``.
    """
    # Fail early and clearly: with any other value no output brick would
    # be created and the failure would surface later as an AttributeError.
    if which_cost not in ('MSE', 'GMM'):
        raise ValueError(
            "which_cost must be 'MSE' or 'GMM', got %r" % (which_cost,))

    super(Parrot, self).__init__(**kwargs)

    self.input_dim = input_dim
    self.output_dim = output_dim
    self.rnn_h_dim = rnn_h_dim
    self.readouts_dim = readouts_dim
    self.layer_norm = layer_norm
    self.which_cost = which_cost
    self.use_speaker = use_speaker
    self.full_feedback = full_feedback
    self.feedback_noise_level = feedback_noise_level
    self.epsilon = epsilon

    self.num_characters = num_characters
    self.attention_type = attention_type
    self.attention_alignment = attention_alignment
    self.attention_size = attention_size
    self.sharpening_coeff = sharpening_coeff
    self.timing_coeff = timing_coeff

    self.encoder_type = encoder_type
    self.encoder_dim = encoder_dim
    self.encoded_input_dim = input_dim
    self.raw_output = raw_output

    # A bidirectional encoder emits forward and backward states together.
    if self.encoder_type == 'bidirectional':
        self.encoded_input_dim = 2 * encoder_dim

    if self.feedback_noise_level is not None:
        self.noise_level_var = tensor.scalar('feedback_noise_level')

    def rnn_fork(name, layer, input_dim):
        # Fork projecting `input_dim` features onto the (inputs, gates)
        # pair of recurrent layer number `layer`; the gate projection is
        # twice as wide as the state.
        return Fork(
            output_names=['rnn%d_inputs' % layer, 'rnn%d_gates' % layer],
            input_dim=input_dim,
            output_dims=[rnn_h_dim, 2 * rnn_h_dim],
            name=name)

    def output_brick(name, input_dim):
        # Output projection: one Linear for MSE, a Fork producing the
        # GMM means, sigmas and mixture coefficients for GMM.
        if which_cost == 'MSE':
            return Linear(
                input_dim=input_dim,
                output_dim=output_dim,
                name=name)
        return Fork(
            output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
            input_dim=input_dim,
            output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
            name=name)

    self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
    self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
    self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

    self.h1_to_readout = Linear(
        input_dim=rnn_h_dim,
        output_dim=readouts_dim,
        name='h1_to_readout')

    self.h2_to_readout = Linear(
        input_dim=rnn_h_dim,
        output_dim=readouts_dim,
        name='h2_to_readout')

    self.h3_to_readout = Linear(
        input_dim=rnn_h_dim,
        output_dim=readouts_dim,
        name='h3_to_readout')

    # Connections from lower recurrent layers to higher ones.
    self.h1_to_h2 = rnn_fork('h1_to_h2', 2, rnn_h_dim)
    self.h1_to_h3 = rnn_fork('h1_to_h3', 3, rnn_h_dim)
    self.h2_to_h3 = rnn_fork('h2_to_h3', 3, rnn_h_dim)

    if which_cost == 'GMM':
        # These attributes only exist for the GMM cost.
        self.sampling_bias = sampling_bias
        self.k_gmm = k_gmm
    self.readout_to_output = output_brick('readout_to_output', readouts_dim)

    self.encoder = Encoder(
        encoder_type,
        num_characters,
        input_dim,
        encoder_dim,
        name='encoder')

    self.children = [
        self.encoder,
        self.rnn1,
        self.rnn2,
        self.rnn3,
        self.h1_to_readout,
        self.h2_to_readout,
        self.h3_to_readout,
        self.h1_to_h2,
        self.h1_to_h3,
        self.h2_to_h3,
        self.readout_to_output]

    # Attention context (encoded labels) into every recurrent layer.
    self.inp_to_h1 = rnn_fork('inp_to_h1', 1, self.encoded_input_dim)
    self.inp_to_h2 = rnn_fork('inp_to_h2', 2, self.encoded_input_dim)
    self.inp_to_h3 = rnn_fork('inp_to_h3', 3, self.encoded_input_dim)

    self.children += [
        self.inp_to_h1,
        self.inp_to_h2,
        self.inp_to_h3]

    self.h1_to_att = Fork(
        output_names=['alpha', 'beta', 'kappa'],
        input_dim=rnn_h_dim,
        output_dims=[attention_size] * 3,
        name='h1_to_att')

    self.att_to_readout = Linear(
        input_dim=self.encoded_input_dim,
        output_dim=readouts_dim,
        name='att_to_readout')

    self.children += [
        self.h1_to_att,
        self.att_to_readout]

    if use_speaker:
        self.num_speakers = num_speakers
        self.speaker_dim = speaker_dim
        self.embed_speaker = LookupTable(num_speakers, speaker_dim)

        self.speaker_to_h1 = rnn_fork('speaker_to_h1', 1, speaker_dim)
        self.speaker_to_h2 = rnn_fork('speaker_to_h2', 2, speaker_dim)
        self.speaker_to_h3 = rnn_fork('speaker_to_h3', 3, speaker_dim)

        self.speaker_to_readout = Linear(
            input_dim=speaker_dim,
            output_dim=readouts_dim,
            name='speaker_to_readout')

        self.speaker_to_output = output_brick(
            'speaker_to_output', speaker_dim)

        self.children += [
            self.embed_speaker,
            self.speaker_to_h1,
            self.speaker_to_h2,
            self.speaker_to_h3,
            self.speaker_to_readout,
            self.speaker_to_output]

    if full_feedback:
        self.out_to_h2 = rnn_fork('out_to_h2', 2, output_dim)
        self.out_to_h3 = rnn_fork('out_to_h3', 3, output_dim)

        self.children += [
            self.out_to_h2,
            self.out_to_h3]
        # Full feedback implies feedback into the first layer as well.
        weak_feedback = True

    self.weak_feedback = weak_feedback

    if weak_feedback:
        self.out_to_h1 = rnn_fork('out_to_h1', 1, output_dim)

        self.children += [
            self.out_to_h1]

    if self.raw_output:
        self.sampleRnn = SampleRnn()
        self.children += [self.sampleRnn]
def train():
    """Train a character-level GRU generator on ``warpeace_input.txt``.

    If ``trainingdata.tar`` exists, the pickled main loop stored there is
    loaded; otherwise the dataset is encoded to HDF5, the model and the
    training algorithm are built and wrapped in a new main loop. The main
    loop is then run.
    """
    # NOTE(review): the source was whitespace-collapsed; the final run()
    # is assumed to apply to both the loaded and the new main loop.
    checkpoint_path = 'trainingdata.tar'

    if not os.path.isfile(checkpoint_path):
        hidden_size = 512
        filename = 'warpeace.hdf5'

        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        feedback = LookupFeedback(alphabet_len, hidden_size,
                                  name='feedback')
        readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=feedback,
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout'
        )

        transition = GatedRecurrent(
            activation=Tanh(),
            dim=hidden_size)
        transition.weights_init = IsotropicGaussian(0.01)

        generator = SequenceGenerator(
            readout=readout,
            transition=transition,
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0),
            name='sequencegenerator')
        generator.push_initialization_config()
        generator.initialize()

        cost = generator.cost(outputs=x)
        cost.name = 'cost'

        graph = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost,
            parameters=graph.parameters,
            step_rule=Scale(0.5))

        # Iterate over the whole dataset in order, 128 examples at a time.
        train_set = encoder.get_dataset()
        scheme = SequentialScheme(train_set.num_examples, batch_size=128)
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=scheme)

        extensions = [
            FinishAfter(),
            Printing(),
            Checkpoint(checkpoint_path, every_n_epochs=10),
            ShowOutput(every_n_epochs=10),
        ]
        main_loop = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=extensions)
    else:
        # Resume from the pickled main loop.
        with open(checkpoint_path, 'rb') as checkpoint:
            main_loop = load(checkpoint)

    main_loop.run()
class TestGatedRecurrent(unittest.TestCase):
    """Check GatedRecurrent against a plain NumPy re-implementation."""

    def setUp(self):
        # Fully gated brick with constant weights: expectations are exact.
        self.gated = GatedRecurrent(
            dim=3, weights_init=Constant(2),
            activation=Tanh(), gate_activation=Tanh())
        self.gated.initialize()

        # Reset-gate-only brick with seeded random weights.
        self.reset_only = GatedRecurrent(
            dim=3, weights_init=IsotropicGaussian(),
            activation=Tanh(), gate_activation=Tanh(),
            use_update_gate=False, seed=1)
        self.reset_only.initialize()

    def test_one_step(self):
        """A single non-iterated step matches the hand-computed update."""
        h0, x, z, r = [tensor.matrix(label) for label in ('h0', 'x', 'z', 'r')]
        h1 = self.gated.apply(x, z, r, h0, iterate=False)
        next_h = theano.function(inputs=[h0, x, z, r], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]], dtype=floatX)
        x_val = 0.1 * numpy.array([[1, 2, 3], [4, 5, 6]], dtype=floatX)
        zi_val = (h0_val + x_val) / 2
        ri_val = -x_val

        # Every recurrent matrix was initialised to the constant 2.
        W_val = 2 * numpy.ones((3, 3), dtype=floatX)
        recurrent = h0_val.dot(W_val)
        z_val = numpy.tanh(recurrent + zi_val)
        r_val = numpy.tanh(recurrent + ri_val)
        candidate = numpy.tanh((r_val * h0_val).dot(W_val) + x_val)
        h1_val = z_val * candidate + (1 - z_val) * h0_val

        actual = next_h(h0_val, x_val, zi_val, ri_val)[0]
        assert_allclose(h1_val, actual, rtol=1e-6)

    def test_reset_only_many_steps(self):
        """A masked 24-step run matches a step-by-step NumPy loop."""
        x = tensor.tensor3('x')
        ri = tensor.tensor3('ri')
        mask = tensor.matrix('mask')
        h = self.reset_only.apply(x, reset_inputs=ri, mask=mask)
        calc_h = theano.function(inputs=[x, ri, mask], outputs=[h])

        n_steps, n_batch, n_dim = 24, 4, 3

        # The 24 permutations of range(4) give a (24, 4) table that is
        # broadcast along a trailing feature axis of size 3.
        perms = numpy.asarray(list(itertools.permutations(range(4))),
                              dtype=floatX)
        x_val = 0.1 * perms
        x_val = numpy.ones((n_steps, n_batch, n_dim),
                           dtype=floatX) * x_val[..., None]
        ri_val = 0.3 - x_val

        # Batch column 3 is masked out for the second half of the steps.
        mask_val = numpy.ones((n_steps, n_batch), dtype=floatX)
        mask_val[12:24, 3] = 0

        W = self.reset_only.state_to_state.get_value()
        U = self.reset_only.state_to_reset.get_value()

        # Row 0 holds the all-zero initial state.
        h_val = numpy.zeros((n_steps + 1, n_batch, n_dim), dtype=floatX)
        for t in range(n_steps):
            previous = h_val[t]
            keep = mask_val[t, :, None]
            r_val = numpy.tanh(previous.dot(U) + ri_val[t])
            h_val[t + 1] = numpy.tanh((r_val * previous).dot(W) + x_val[t])
            # Masked-out entries keep their previous state.
            h_val[t + 1] = keep * h_val[t + 1] + (1 - keep) * previous
        h_val = h_val[1:]

        # TODO Figure out why this tolerance needs to be so big
        assert_allclose(h_val, calc_h(x_val, ri_val, mask_val)[0],
                        rtol=1e-03)
def __init__(self, config):
    """Build the predictive GRU hierarchy, its costs and monitored outputs.

    The symbolic input is an integer matrix named ``'bytes'``. Each value
    is looked up in ``config.embedding_matrix``; a stack of GatedRecurrent
    levels then each try to predict their current target, and the
    thresholded residual of one level becomes the target of the next.
    The hidden states of all levels are projected, summed and (optionally)
    passed through an MLP to produce the output scores.

    Parameters
    ----------
    config : object
        Configuration object. The attributes read here are
        ``embedding_matrix``, ``repr_dim``, ``hidden_dims``,
        ``cost_factors``, ``hidden_q``, ``num_seqs``,
        ``activation_function``, ``out_hidden``, ``out_hidden_act``,
        ``io_dim``, ``weights_init``, ``biases_init`` and
        ``weight_noise``.

    Attributes set
    --------------
    sgd_cost, monitor_vars, out, pred, states
    """
    float_type = theano.config.floatX

    inp = tensor.imatrix('bytes')
    embed = theano.shared(config.embedding_matrix.astype(float_type),
                          name='embedding_matrix')
    in_repr = embed[inp.flatten(), :].reshape(
        (inp.shape[0], inp.shape[1], config.repr_dim))
    in_repr.name = 'in_repr'

    bricks = []
    states = []

    # Construct predictive GRU hierarchy
    hidden = []
    costs = []
    # Swap the first two axes before feeding the recurrent bricks.
    next_target = in_repr.dimshuffle(1, 0, 2)
    level_specs = zip(config.hidden_dims, config.cost_factors,
                      config.hidden_q)
    for level, (hdim, cf, q) in enumerate(level_specs):
        init_state = theano.shared(
            numpy.zeros((config.num_seqs, hdim)).astype(float_type),
            name='st0_%d'%level)

        # One projection yields 3 * hdim features: the first hdim feed the
        # state input, the remaining 2 * hdim feed the gates.
        in_proj = Linear(input_dim=config.repr_dim, output_dim=3*hdim,
                         name="lstm_in_%d"%level)
        gru = GatedRecurrent(dim=hdim,
                             activation=config.activation_function,
                             name="lstm_rec_%d"%level)
        out_proj = Linear(input_dim=hdim, output_dim=config.repr_dim,
                          name='lstm_out_%d'%level)
        squash = Tanh('lstm_out_tanh_%d'%level)
        bricks.extend([in_proj, gru, out_proj, squash])
        if level > 0:
            # Levels above the first also see the previous level's states.
            below_proj = Linear(input_dim=config.hidden_dims[level-1],
                                output_dim=3*hdim,
                                name='lstm_in2_%d'%level)
            bricks.append(below_proj)

        next_target = tensor.cast(next_target, dtype=float_type)
        inter = in_proj.apply(theano.gradient.disconnected_grad(next_target))
        if level > 0:
            below = theano.gradient.disconnected_grad(hidden[-1][:-1, :, :])
            inter = inter + below_proj.apply(below)
        new_hidden = gru.apply(inputs=inter[:, :, :hdim],
                               gate_inputs=inter[:, :, hdim:],
                               states=init_state)
        # Pair each initial state with the last computed state.
        states.append((init_state, new_hidden[-1, :, :]))

        # Prepend the initial state so index t holds the state *before*
        # input t was consumed.
        hidden.append(
            tensor.concatenate([init_state[None, :, :], new_hidden], axis=0))
        pred = squash.apply(out_proj.apply(hidden[-1][:-1, :, :]))
        costs.append(
            numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean())
        costs.append(
            numpy.float32(cf) * q * abs(pred).sum(axis=2).mean())
        # The next level's target is the residual thresholded to {-1, 0, 1}.
        diff = next_target - pred
        next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)

    # Construct output from hidden states
    hidden = [level_states.dimshuffle(1, 0, 2) for level_states in hidden]

    out_parts = []
    out_dims = config.out_hidden + [config.io_dim]
    for level, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
        pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                             name='pred_linear_%d'%level)
        bricks.append(pred_linear)
        out_parts.append(
            pred_linear.apply(theano.gradient.disconnected_grad(state)))

    # Do prediction and calculate cost
    out = sum(out_parts)

    if len(out_dims) > 1:
        out = config.out_hidden_act[0](name='out_act0').apply(out)
        # NOTE(review): the first MLP activation is also named 'out_act0'
        # because the enumeration below restarts at 0 -- confirm intended.
        mlp_activations = [
            act_cls(name='out_act%d'%idx)
            for idx, act_cls in enumerate(config.out_hidden_act[1:])]
        mlp = MLP(dims=out_dims,
                  activations=mlp_activations + [Identity()],
                  name='out_mlp')
        bricks.append(mlp)
        # Flatten the first two axes for the MLP, then restore them.
        flat_out = out.reshape((inp.shape[0]*(inp.shape[1]+1), -1))
        out = mlp.apply(flat_out).reshape(
            (inp.shape[0], inp.shape[1]+1, -1))

    pred = out.argmax(axis=2)

    flat_scores = out[:, :-1, :].reshape(
        (inp.shape[0]*inp.shape[1], config.io_dim))
    cost = Softmax().categorical_cross_entropy(inp.flatten(),
                                               flat_scores).mean()
    error_rate = tensor.neq(inp.flatten(), pred[:, :-1].flatten()).mean()

    sgd_cost = cost + sum(costs)

    # Initialize all bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()

    # apply noise
    cg = ComputationGraph([sgd_cost, cost, error_rate] + costs)
    if config.weight_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.weight_noise)
    # Re-read every output from the (possibly noised) graph.
    sgd_cost = cg.outputs[0]
    cost = cg.outputs[1]
    error_rate = cg.outputs[2]
    costs = cg.outputs[3:]

    # put stuff into self that is useful for training or extensions
    self.sgd_cost = sgd_cost

    sgd_cost.name = 'sgd_cost'
    for idx, aux_cost in enumerate(costs):
        aux_cost.name = 'pred_cost_%d'%idx
    cost.name = 'cost'
    error_rate.name = 'error_rate'
    self.monitor_vars = [costs, [cost], [error_rate]]

    self.out = out[:, 1:, :]
    self.pred = pred[:, 1:]
    self.states = states
class GatedRecurrentFull(Initializable):
    """A wrapper around the GatedRecurrent brick that improves usability.

    It contains:

    * Input transforms that map the input to the state, update-gate and
      reset-gate inputs of the recurrent brick.
    * Separate initialization schemes for the state-to-state,
      state-to-update and state-to-reset weight matrices.

    While this works, there is probably a better more elegant way to do
    this.

    Parameters
    ----------
    hidden_dim : int
        Dimension of the hidden state.
    activation : :class:`.Brick`
        Passed to the wrapped recurrent brick as ``activation``.
    gate_activation : :class:`.Brick`
        Passed to the wrapped recurrent brick as ``gate_activation``.
    state_to_state_init : object
        Weight initialization scheme (must provide ``generate``).
    state_to_reset_init : object
        Weight initialization scheme (must provide ``generate``).
    state_to_update_init : object
        Weight initialization scheme (must provide ``generate``).
    input_to_state_transform : :class:`.Brick`
        [CvMG14] uses Linear transform.
    input_to_reset_transform : :class:`.Brick`
        [CvMG14] uses Linear transform.
    input_to_update_transform : :class:`.Brick`
        [CvMG14] uses Linear transform.

    References
    ----------
    .. [CvMG14] Kyunghyun Cho, Bart van Merriënboer, Çağlar Gülçehre,
       Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, and Yoshua
       Bengio, *Learning Phrase Representations using RNN Encoder-Decoder
       for Statistical Machine Translation*, EMNLP (2014), pp. 1724-1734.

    """
    @lazy(allocation=['hidden_dim', 'state_to_state_init',
                      'state_to_update_init', 'state_to_reset_init'],
          initialization=['input_to_state_transform',
                          'input_to_update_transform',
                          'input_to_reset_transform'])
    def __init__(self, hidden_dim, activation=None, gate_activation=None,
                 state_to_state_init=None, state_to_update_init=None,
                 state_to_reset_init=None, input_to_state_transform=None,
                 input_to_update_transform=None,
                 input_to_reset_transform=None, **kwargs):
        super(GatedRecurrentFull, self).__init__(**kwargs)
        self.hidden_dim = hidden_dim

        self.state_to_state_init = state_to_state_init
        self.state_to_update_init = state_to_update_init
        self.state_to_reset_init = state_to_reset_init

        self.input_to_state_transform = input_to_state_transform
        self.input_to_update_transform = input_to_update_transform
        self.input_to_reset_transform = input_to_reset_transform
        # Suffix the transform names so the three bricks stay
        # distinguishable even if they were created with the same name.
        self.input_to_state_transform.name += "_input_to_state_transform"
        self.input_to_update_transform.name += "_input_to_update_transform"
        self.input_to_reset_transform.name += "_input_to_reset_transform"

        # Selects between GatedRecurrentFast (separate update/reset
        # matrices) and the stock GatedRecurrent (one fused gate matrix).
        self.use_mine = True
        # The NaN weights are placeholders: initialize() overwrites the
        # recurrent matrices with values from the dedicated schemes.
        if self.use_mine:
            self.rnn = GatedRecurrentFast(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=activation,
                gate_activation=gate_activation)
        else:
            self.rnn = GatedRecurrent(
                weights_init=Constant(np.nan),
                dim=self.hidden_dim,
                activation=activation,
                gate_activation=gate_activation)

        self.children = [self.rnn,
                         self.input_to_state_transform,
                         self.input_to_update_transform,
                         self.input_to_reset_transform]
        self.children.extend(self.rnn.children)

    def initialize(self):
        """Initialize all children, then overwrite the recurrent weights.

        The three recurrent matrices are generated with this brick's
        ``rng`` from the ``state_to_*_init`` schemes and written into the
        wrapped recurrent brick's shared variables.
        """
        super(GatedRecurrentFull, self).initialize()

        self.input_to_state_transform.initialize()
        self.input_to_update_transform.initialize()
        self.input_to_reset_transform.initialize()
        self.rnn.initialize()

        weight_shape = (self.hidden_dim, self.hidden_dim)
        state_to_state = self.state_to_state_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_update = self.state_to_update_init.generate(
            rng=self.rng, shape=weight_shape)
        state_to_reset = self.state_to_reset_init.generate(
            rng=self.rng, shape=weight_shape)

        self.rnn.state_to_state.set_value(state_to_state)

        if self.use_mine:
            self.rnn.state_to_update.set_value(state_to_update)
            self.rnn.state_to_reset.set_value(state_to_reset)
        else:
            # The stock brick stores both gates in one matrix, update
            # columns first, then reset columns.
            self.rnn.state_to_gates.set_value(
                np.hstack((state_to_update, state_to_reset)))

    @application(inputs=['input_'], outputs=['output'])
    def apply(self, input_, mask=None):
        """Run the wrapped recurrent brick over the transformed input.

        Parameters
        ----------
        input_ : :class:`~tensor.TensorVariable`
            Sequence to feed into the GRU. Expected to be 3D, since the
            gate inputs are concatenated along axis 2. The original
            documentation gives the axes as (batch, sequence, features).
            NOTE(review): Blocks recurrent bricks normally iterate over
            the first axis -- confirm the axis order against callers.
        mask : :class:`~tensor.TensorVariable`, optional
            Binary mask with 1 where data is available and 0 elsewhere.
            Forwarded unchanged to the wrapped recurrent brick.

        Returns
        -------
        output : :class:`theano.tensor.TensorVariable`
            The states returned by the wrapped recurrent brick.

        """
        states_from_in = self.input_to_state_transform.apply(input_)
        update_from_in = self.input_to_update_transform.apply(input_)
        reset_from_in = self.input_to_reset_transform.apply(input_)

        if self.use_mine:
            output = self.rnn.apply(inputs=states_from_in,
                                    update_inputs=update_from_in,
                                    reset_inputs=reset_from_in,
                                    mask=mask)
        else:
            # Only the stock brick takes fused gate inputs, so build them
            # here rather than unconditionally.
            gate_inputs = tensor.concatenate(
                [update_from_in, reset_from_in], axis=2)
            # Forward the mask on this path too; previously it was
            # silently dropped, so padded steps updated the state.
            output = self.rnn.apply(inputs=states_from_in,
                                    gate_inputs=gate_inputs,
                                    mask=mask)

        return output