Exemplo n.º 1
0
def example2():
    """Print GRU states for an all-ones input, directly and after doubling.

    A ``Fork`` (outputs ``"linear"`` and ``"gates"``) feeds a
    ``GatedRecurrent``; both use identity weights and zero biases. The
    hidden states are compiled with Theano and printed for an all-ones
    array of shape ``(dim, 1, dim)``. The same fork/GRU pair is then applied
    to the input passed through a ``Linear`` brick initialised with
    ``Identity(2)``, and those states are printed too.
    """
    dim = 3
    x = tensor.tensor3('x')

    fork = Fork(
        input_dim=dim,
        output_dims=[dim, dim * 2],
        name='fork',
        output_names=["linear", "gates"],
        weights_init=initialization.Identity(),
        biases_init=Constant(0))
    gru = GatedRecurrent(
        dim=dim,
        weights_init=initialization.Identity(),
        biases_init=Constant(0))
    for brick in (fork, gru):
        brick.initialize()

    # First pass: x goes straight through the fork into the GRU.
    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)
    run_plain = theano.function([x], h)

    # The same all-ones array is fed to both compiled functions.
    ones_input = np.ones((dim, 1, dim), dtype=theano.config.floatX)
    print(run_plain(ones_input))

    # Second pass: x is first sent through a Linear brick with Identity(2)
    # weights and zero bias, then through the same fork and GRU.
    doubler = Linear(
        input_dim=dim,
        output_dim=dim,
        weights_init=initialization.Identity(2),
        biases_init=initialization.Constant(0))
    doubler.initialize()

    doubled = doubler.apply(x)
    lin, gate = fork.apply(doubled)
    h_doubler = gru.apply(lin, gate)

    run_doubled = theano.function([x], h_doubler)
    print(run_doubled(ones_input))
Exemplo n.º 2
0
def example2():
    """GRU demo: print states for an all-ones input, plain and doubled.

    Builds a ``Fork`` and a ``GatedRecurrent`` with identity weights and
    zero biases, prints the GRU output for an all-ones input of shape
    ``(dim, 1, dim)``, then repeats with the input first passed through a
    ``Linear`` brick initialised with ``Identity(2)``.
    """
    x = tensor.tensor3('x')
    dim = 3

    def show(output):
        # Compile ``output`` as a function of x and print its value for an
        # all-ones input.
        compiled = theano.function([x], output)
        print(compiled(np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    fork = Fork(
        input_dim=dim,
        output_dims=[dim, dim * 2],
        name='fork',
        output_names=["linear", "gates"],
        weights_init=initialization.Identity(),
        biases_init=Constant(0),
    )
    gru = GatedRecurrent(
        dim=dim,
        weights_init=initialization.Identity(),
        biases_init=Constant(0),
    )

    fork.initialize()
    gru.initialize()

    linear, gate_inputs = fork.apply(x)
    h = gru.apply(linear, gate_inputs)
    show(h)

    doubler = Linear(
        input_dim=dim,
        output_dim=dim,
        weights_init=initialization.Identity(2),
        biases_init=initialization.Constant(0),
    )
    doubler.initialize()

    lin, gate = fork.apply(doubler.apply(x))
    h_doubler = gru.apply(lin, gate)
    show(h_doubler)
class BidirectionalEncoder(Initializable):
    """Encoder of the RNNsearch model: lookup table plus bidirectional GRU.

    Parameters
    ----------
    vocab_size : int
        Pushed to the lookup table as its ``length``.
    embedding_dim : int
        Pushed to the lookup table as its ``dim`` and to both forks as
        their ``input_dim``.
    state_dim : int
        ``dim`` of the ``GatedRecurrent`` prototype.

    """
    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.lookup = LookupTable(name='embeddings')
        self.bidir = NewBidirectional(
            GatedRecurrent(activation=Tanh(), dim=state_dim))

        # One fork per direction. Each fork outputs every sequence input of
        # the recurrent prototype except the mask.
        prototype_sequences = self.bidir.prototype.apply.sequences
        self.fwd_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='fwd_fork')
        self.back_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='back_fork')

        self.children = [
            self.lookup, self.bidir, self.fwd_fork, self.back_fork
        ]

    def _push_allocation_config(self):
        """Propagate dimensions to the lookup table and to both forks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        # Pair each fork with the bidirectional child whose dims it must
        # match: children[0] for forward, children[1] for backward.
        pairs = ((self.fwd_fork, self.bidir.children[0]),
                 (self.back_fork, self.bidir.children[1]))
        for fork, rnn in pairs:
            fork.input_dim = self.embedding_dim
            fork.output_dims = [
                rnn.get_dim(output) for output in fork.output_names
            ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Embed the sentence and run it through the bidirectional GRU.

        Both inputs are transposed first. The result is also stored on the
        brick as ``self.representation``.
        """
        # Time as first dimension.
        sentence_t = source_sentence.T
        mask_t = source_sentence_mask.T

        # Conversion to embedding representation here.
        embeddings = self.lookup.apply(sentence_t)

        fwd_inputs = merge(self.fwd_fork.apply(embeddings, as_dict=True),
                           {'mask': mask_t})
        back_inputs = merge(self.back_fork.apply(embeddings, as_dict=True),
                            {'mask': mask_t})
        representation = self.bidir.apply(fwd_inputs, back_inputs)

        self.representation = representation
        return representation
Exemplo n.º 4
0
class BidirectionalEncoder(Initializable):
    """Bidirectional GRU encoder applied directly to an embedding input.

    Unlike a word-level encoder, this variant has no lookup table: the
    input to ``apply`` is forked straight into the recurrent inputs.

    Parameters
    ----------
    embedding_dim : int
        ``input_dim`` pushed to both forks.
    state_dim : int
        ``dim`` of the ``GatedRecurrent`` prototype.

    """
    def __init__(self, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))

        # Each fork outputs every sequence input of the recurrent prototype
        # except the mask.
        prototype_sequences = self.bidir.prototype.apply.sequences
        self.fwd_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='fwd_fork')
        self.back_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='back_fork')

        self.children = [self.bidir, self.fwd_fork, self.back_fork]

    def _push_allocation_config(self):
        """Propagate input/output dimensions to both forks."""
        forward_rnn = self.bidir.children[0]
        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            forward_rnn.get_dim(output)
            for output in self.fwd_fork.output_names
        ]

        backward_rnn = self.bidir.children[1]
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            backward_rnn.get_dim(output)
            for output in self.back_fork.output_names
        ]

    @application(inputs=['image_embedding'], outputs=['representation'])
    def apply(self, image_embedding):
        """Run the bidirectional GRU over ``image_embedding``.

        The mask is all ones, shaped like the first two axes of the input.
        """
        # NOTE(review): the original comment here read "Time as first
        # dimension", but nothing is transposed -- confirm the expected
        # input layout with the caller.
        image_embedding_mask = tensor.ones(image_embedding.shape[:2])

        fwd_inputs = merge(
            self.fwd_fork.apply(image_embedding, as_dict=True),
            {'mask': image_embedding_mask})
        back_inputs = merge(
            self.back_fork.apply(image_embedding, as_dict=True),
            {'mask': image_embedding_mask})
        representation = self.bidir.apply(fwd_inputs, back_inputs)
        return representation
Exemplo n.º 5
0
class BidirectionalEncoder(Initializable):
    """Bidirectional GRU encoder.

    Parameters
    ----------
    embedding_dim : int
        Dimension of the word embeddings taken as input; pushed to both
        forks as their ``input_dim``.
    state_dim : int
        Hidden state dimension of the ``GatedRecurrent`` prototype.

    """
    def __init__(self, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        # The bidirectional GRU.
        gru_prototype = GatedRecurrent(activation=Tanh(), dim=state_dim)
        self.bidir = BidirectionalFromDict(gru_prototype)

        # Forks that produce the GRU inputs (everything but the mask), one
        # per direction.
        prototype_sequences = self.bidir.prototype.apply.sequences
        self.fwd_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='fwd_fork')
        self.back_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='back_fork')

        self.children = [self.bidir, self.fwd_fork, self.back_fork]

    def _push_allocation_config(self):
        """Propagate input/output dimensions to both forks."""
        # children[0] pairs with the forward fork, children[1] with the
        # backward fork.
        pairs = ((self.fwd_fork, self.bidir.children[0]),
                 (self.back_fork, self.bidir.children[1]))
        for fork, rnn in pairs:
            fork.input_dim = self.embedding_dim
            fork.output_dims = [
                rnn.get_dim(output) for output in fork.output_names
            ]

    @application(inputs=['source_sentence_tbf', 'source_sentence_mask_tb'],
                 outputs=['representation'])
    def apply(self, source_sentence_tbf, source_sentence_mask_tb=None):
        """Fork the input for both directions and run the bidirectional GRU.

        The mask (``None`` by default) is passed on under the ``'mask'``
        key for both directions.
        """
        # NOTE(review): the _tbf / _tb suffixes presumably mean
        # (time, batch, feature) / (time, batch) -- confirm with callers.
        fwd_inputs = merge(
            self.fwd_fork.apply(source_sentence_tbf, as_dict=True),
            {'mask': source_sentence_mask_tb})
        back_inputs = merge(
            self.back_fork.apply(source_sentence_tbf, as_dict=True),
            {'mask': source_sentence_mask_tb})
        representation_tbf = self.bidir.apply(fwd_inputs, back_inputs)
        return representation_tbf
Exemplo n.º 6
0
class BidirectionalEncoder(Initializable):
    """Encoder of the RNNsearch model: lookup table plus bidirectional GRU.

    Parameters
    ----------
    vocab_size : int
        Pushed to the lookup table as its ``length``.
    embedding_dim : int
        Pushed to the lookup table as its ``dim`` and to both forks as
        their ``input_dim``.
    state_dim : int
        ``dim`` of the ``GatedRecurrent`` prototype.

    """

    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.lookup = LookupTable(name='embeddings')
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))

        # Each fork outputs every sequence input of the recurrent prototype
        # except the mask.
        prototype_sequences = self.bidir.prototype.apply.sequences
        self.fwd_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='fwd_fork')
        self.back_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='back_fork')

        self.children = [
            self.lookup, self.bidir, self.fwd_fork, self.back_fork,
        ]

    def _push_allocation_config(self):
        """Propagate dimensions to the lookup table and to both forks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        forward_rnn = self.bidir.children[0]
        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            forward_rnn.get_dim(output)
            for output in self.fwd_fork.output_names
        ]

        backward_rnn = self.bidir.children[1]
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            backward_rnn.get_dim(output)
            for output in self.back_fork.output_names
        ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Embed the transposed sentence and run the bidirectional GRU."""
        # Time as first dimension
        sentence_t = source_sentence.T
        mask_t = source_sentence_mask.T

        embeddings = self.lookup.apply(sentence_t)

        fwd_inputs = merge(self.fwd_fork.apply(embeddings, as_dict=True),
                           {'mask': mask_t})
        back_inputs = merge(self.back_fork.apply(embeddings, as_dict=True),
                            {'mask': mask_t})
        representation = self.bidir.apply(fwd_inputs, back_inputs)
        return representation
Exemplo n.º 7
0
class RecurrentWithFork(Initializable):
    """Wrap a recurrent application with a fork that produces its inputs.

    Parameters
    ----------
    recurrent : application
        The recurrent application to call. Its ``sequences``, ``states``
        and ``brick`` attributes are used.
    input_dim : int
        Input dimension of the fork; may be allocated lazily.

    """

    @lazy(allocation=['input_dim'])
    def __init__(self, recurrent, input_dim, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.recurrent = recurrent
        self.input_dim = input_dim

        # The fork outputs every sequence input of the recurrent
        # application except the mask.
        fork_outputs = [seq for seq in self.recurrent.sequences
                        if seq != 'mask']
        self.fork = Fork(fork_outputs, prototype=Linear())
        self.children = [recurrent.brick, self.fork]

    def _push_allocation_config(self):
        """Give the fork its input dim and the recurrent brick's dims."""
        self.fork.input_dim = self.input_dim
        rnn_brick = self.recurrent.brick
        self.fork.output_dims = [rnn_brick.get_dim(output)
                                 for output in self.fork.output_names]

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        """Fork ``input_`` and call the recurrent application on the result.

        Extra keyword arguments are combined with the forked inputs via
        ``dict_union`` and passed on.
        """
        forked = self.fork.apply(input_, as_dict=True)
        recurrent_kwargs = dict_union(forked, kwargs)
        return self.recurrent(mask=mask, **recurrent_kwargs)

    @apply.property('outputs')
    def apply_outputs(self):
        """Output names of ``apply``: the recurrent application's states."""
        return self.recurrent.states
Exemplo n.º 8
0
def gru_layer(dim, h, n):
    """Build, initialize and apply fork + GRU layer number ``n`` to ``h``.

    The fork maps ``dim`` inputs to a ``dim``-wide linear output and a
    ``2 * dim``-wide gate output, both of which feed the GRU. Brick and
    output names carry ``n`` as a suffix.

    Returns whatever ``GatedRecurrent.apply`` returns for those inputs.
    """
    suffix = str(n)
    fork = Fork(
        output_names=['linear' + suffix, 'gates' + suffix],
        name='fork' + suffix,
        input_dim=dim,
        output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + suffix)
    initialize([fork, gru])

    linear, gates = fork.apply(h)
    return gru.apply(linear, gates)
Exemplo n.º 9
0
class RecurrentWithFork(Initializable):
    """Recurrent application fed by a fork of a single input.

    Obtained from Dima's code (@rizar):
    https://github.com/rizar/attention-lvcsr/blob/master/lvsr/bricks/__init__.py

    Parameters
    ----------
    recurrent : application
        The recurrent application to call. Its ``sequences``, ``states``
        and ``brick`` attributes are used.
    input_dim : int
        Input dimension of the fork; may be allocated lazily.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, recurrent, input_dim, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.recurrent = recurrent
        self.input_dim = input_dim

        # Everything the recurrent application takes as a sequence, except
        # the mask, becomes a fork output.
        non_mask_sequences = [
            seq for seq in self.recurrent.sequences if seq != 'mask'
        ]
        self.fork = Fork(non_mask_sequences, prototype=Linear())
        self.children = [recurrent.brick, self.fork]

    def _push_allocation_config(self):
        """Give the fork its input dim and the recurrent brick's dims."""
        rnn_brick = self.recurrent.brick
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [
            rnn_brick.get_dim(output) for output in self.fork.output_names
        ]

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        """Fork ``input_`` and call the recurrent application on the result.

        Extra keyword arguments are combined with the forked inputs via
        ``dict_union`` and passed on.
        """
        forked_inputs = self.fork.apply(input_, as_dict=True)
        call_kwargs = dict_union(forked_inputs, kwargs)
        return self.recurrent(mask=mask, **call_kwargs)

    @apply.property('outputs')
    def apply_outputs(self):
        """Output names of ``apply``: the recurrent application's states."""
        return self.recurrent.states
Exemplo n.º 10
0
def gru_layer(dim, h, n):
    """Apply a freshly built and initialized fork + GRU pair to ``h``.

    ``n`` is appended to every brick and output name. The fork has
    ``input_dim=dim`` and output dims ``[dim, dim * 2]``; its two outputs
    are passed positionally to ``GatedRecurrent.apply``.
    """
    tag = str(n)
    output_names = ['linear' + tag, 'gates' + tag]

    fork = Fork(output_names=output_names,
                name='fork' + tag,
                input_dim=dim,
                output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + tag)
    initialize([fork, gru])

    linear, gates = fork.apply(h)
    states = gru.apply(linear, gates)
    return states
Exemplo n.º 11
0
class RecurrentWithFork(Initializable):
    """Recurrent transition brick fed by a fork of a single input.

    All configuration (dimensions and initialization schemes) is applied
    to the children eagerly in ``__init__``.

    Parameters
    ----------
    transition : brick
        Recurrent brick; its ``apply`` application is used.
    input_dim : int
        Input dimension of the fork.
    hidden_dim : int
        Assigned to ``transition.dim``.
    rec_weights_init : object
        Weight initialization scheme assigned to the transition.
    ff_weights_init : object
        Weight initialization scheme assigned to the fork.
    biases_init : object
        Bias initialization scheme assigned to both the transition and
        the fork.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, transition, input_dim, hidden_dim, rec_weights_init,
                 ff_weights_init, biases_init, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.rec_weights_init = rec_weights_init
        self.ff_weights_init = ff_weights_init
        self.biases_init = biases_init
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.transition = transition
        self.transition.dim = self.hidden_dim
        self.transition.weights_init = self.rec_weights_init
        # The attribute is ``biases_init`` (as used on the fork below); the
        # previous ``bias_init`` spelling set an attribute nothing reads.
        self.transition.biases_init = self.biases_init

        # The fork outputs every sequence input of the transition except
        # the mask.
        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'],
            prototype=Linear())
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [
            self.transition.apply.brick.get_dim(name)
            for name in self.fork.output_names
        ]
        self.fork.weights_init = self.ff_weights_init
        self.fork.biases_init = self.biases_init

        self.children = [transition, self.fork]

    # NOTE(review): there are deliberately no _push_allocation_config /
    # _push_initialization_config overrides; the children are configured
    # once in __init__. An ``input_dim`` supplied after construction is
    # therefore not re-pushed to the fork -- confirm this is intended.

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        """Fork ``input_`` and apply the transition to the result.

        Returns only the first element when the transition returns a list
        (e.g. ``[states, cells]`` for an LSTM), otherwise the returned
        value unchanged.
        """
        states = self.transition.apply(
            mask=mask,
            **dict_union(self.fork.apply(input_, as_dict=True), kwargs))
        # Blocks returns a list [states, cells] for LSTM but bare states
        # for GRU or a plain RNN. Only the states are exposed; cells should
        # not be visible from outside.
        return states[0] if isinstance(states, list) else states

    @apply.property('outputs')
    def apply_outputs(self):
        """Output names of ``apply``: the transition's state names."""
        return self.transition.apply.states
Exemplo n.º 12
0
class RecurrentWithFork(Initializable):
    """Recurrent brick fed by a fork of a single input.

    Parameters
    ----------
    proto : brick
        Recurrent brick; its ``apply`` application and ``get_dim`` method
        are used.
    input_dim : int
        Input dimension of the fork; may be allocated lazily.

    """
    @lazy(allocation=['input_dim'])
    def __init__(self, proto, input_dim, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.recurrent = proto
        self.input_dim = input_dim
        # The fork outputs every sequence input of the recurrent brick
        # except the mask.
        self.fork = Fork(
            [name for name in self.recurrent.apply.sequences
             if name != 'mask'],
            prototype=Linear())
        self.children = [self.recurrent, self.fork]

    def _push_allocation_config(self):
        """Give the fork its input dim and the recurrent brick's dims."""
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [
            self.recurrent.get_dim(name) for name in self.fork.output_names
        ]

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        """Fork ``input_`` and apply the recurrent brick to the result.

        Extra keyword arguments are combined with the forked inputs via
        ``dict_union`` and passed on.
        """
        return self.recurrent.apply(
            mask=mask,
            **dict_union(self.fork.apply(input_, as_dict=True), kwargs))

    @apply.property('outputs')
    def apply_outputs(self):
        """Output names of ``apply``: the recurrent application's states."""
        # ``self.recurrent`` is a brick here, so the state names live on its
        # ``apply`` application (just like ``sequences`` in __init__), not
        # on the brick itself.
        return self.recurrent.apply.states
Exemplo n.º 13
0
class BidirectionalEncoder(Initializable):
    """Encoder of the RNNsearch model, operating on word indices.

    Parameters
    ----------
    vocab_size : int
        Pushed to the lookup table as its ``length``.
    embedding_dim : int
        Pushed to the lookup table as its ``dim`` and to both forks as
        their ``input_dim``.
    state_dim : int
        ``dim`` of the ``GatedRecurrent`` prototype.

    """

    def __init__(self, vocab_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.lookup = LookupTable(name='words_embeddings')
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))

        # Each fork outputs every sequence input of the recurrent prototype
        # except the mask.
        prototype_sequences = self.bidir.prototype.apply.sequences
        self.fwd_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='words_fwd_fork')
        self.back_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='words_back_fork')

        self.children = [
            self.lookup, self.bidir, self.fwd_fork, self.back_fork,
        ]

    def _push_allocation_config(self):
        """Propagate dimensions to the lookup table and to both forks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        # children[0] pairs with the forward fork, children[1] with the
        # backward fork.
        pairs = ((self.fwd_fork, self.bidir.children[0]),
                 (self.back_fork, self.bidir.children[1]))
        for fork, rnn in pairs:
            fork.input_dim = self.embedding_dim
            fork.output_dims = [rnn.get_dim(output)
                                for output in fork.output_names]

    @application(inputs=['words', 'words_mask'],
                 outputs=['representation'])
    def apply(self, words, words_mask):
        """Embed the transposed words and run the bidirectional GRU."""
        # Time as first dimension
        words_t = words.T
        mask_t = words_mask.T

        embeddings = self.lookup.apply(words_t)
        fwd_inputs = merge(self.fwd_fork.apply(embeddings, as_dict=True),
                           {'mask': mask_t})
        back_inputs = merge(self.back_fork.apply(embeddings, as_dict=True),
                            {'mask': mask_t})
        representation = self.bidir.apply(fwd_inputs, back_inputs)
        return representation
Exemplo n.º 14
0
class BidirectionalEncoder(Initializable):
    """Bidirectional GRU encoder.

    Parameters
    ----------
    embedding_dim : int
        Dimension of the word embeddings taken as input; pushed to both
        forks as their ``input_dim``.
    state_dim : int
        Hidden state dimension of the ``GatedRecurrent`` prototype.

    """

    def __init__(self, embedding_dim, state_dim, **kwargs):
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        # The bidirectional GRU.
        self.bidir = BidirectionalFromDict(
            GatedRecurrent(activation=Tanh(), dim=state_dim))

        # Forks that produce the GRU inputs (everything but the mask), one
        # per direction.
        prototype_sequences = self.bidir.prototype.apply.sequences
        self.fwd_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='fwd_fork')
        self.back_fork = Fork(
            [seq for seq in prototype_sequences if seq != 'mask'],
            prototype=Linear(),
            name='back_fork')

        self.children = [self.bidir, self.fwd_fork, self.back_fork]

    def _push_allocation_config(self):
        """Propagate input/output dimensions to both forks."""
        forward_rnn = self.bidir.children[0]
        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            forward_rnn.get_dim(output)
            for output in self.fwd_fork.output_names
        ]

        backward_rnn = self.bidir.children[1]
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            backward_rnn.get_dim(output)
            for output in self.back_fork.output_names
        ]

    @application(inputs=['source_sentence_tbf', 'source_sentence_mask_tb'],
                 outputs=['representation'])
    def apply(self, source_sentence_tbf, source_sentence_mask_tb=None):
        """Fork the input for both directions and run the bidirectional GRU.

        The mask (``None`` by default) is passed on under the ``'mask'``
        key for both directions.
        """
        # NOTE(review): the _tbf / _tb suffixes presumably mean
        # (time, batch, feature) / (time, batch) -- confirm with callers.
        fwd_inputs = merge(
            self.fwd_fork.apply(source_sentence_tbf, as_dict=True),
            {'mask': source_sentence_mask_tb})
        back_inputs = merge(
            self.back_fork.apply(source_sentence_tbf, as_dict=True),
            {'mask': source_sentence_mask_tb})
        representation_tbf = self.bidir.apply(fwd_inputs, back_inputs)
        return representation_tbf
Exemplo n.º 15
0
class InnerRecurrent(BaseRecurrent, Initializable):
    """Recurrent brick whose GRU inputs are the sum of two forked inputs.

    At each step the sequence input and the context input are each sent
    through their own fork; the fork outputs are added key by key and fed
    to a single non-iterating ``GatedRecurrent`` step.

    Parameters
    ----------
    inner_input_dim : int
        Input dimension of the fork applied to ``inner_inputs``.
    outer_input_dim : int
        Input dimension of the fork applied to ``outer_inputs``.
    inner_dim : int
        ``dim`` of the wrapped ``GatedRecurrent``.

    """
    def __init__(self, inner_input_dim, outer_input_dim, inner_dim, **kwargs):
        # The child bricks are created before the base-class constructor
        # runs; ``children`` is assigned afterwards.
        self.inner_gru = GatedRecurrent(dim=inner_dim, name='inner_gru')

        self.inner_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=inner_input_dim, name='inner_input_fork')
        self.outer_input_fork = Fork(
            output_names=[name for name in self.inner_gru.apply.sequences
                          if 'mask' not in name],
            input_dim=outer_input_dim, name='inner_outer_fork')

        super(InnerRecurrent, self).__init__(**kwargs)

        self.children = [
            self.inner_gru, self.inner_input_fork, self.outer_input_fork]

    def _push_allocation_config(self):
        """Set both forks' output dims from the inner GRU's dims."""
        self.inner_input_fork.output_dims = self.inner_gru.get_dims(
            self.inner_input_fork.output_names)
        self.outer_input_fork.output_dims = self.inner_gru.get_dims(
            self.outer_input_fork.output_names)

    @recurrent(sequences=['inner_inputs'], states=['states'],
               contexts=['outer_inputs'], outputs=['states'])
    def apply(self, inner_inputs, states, outer_inputs):
        """Run one GRU step on the summed forked inputs.

        Returns the new states produced by the inner GRU.
        """
        forked_inner = self.inner_input_fork.apply(inner_inputs, as_dict=True)
        forked_outer = self.outer_input_fork.apply(outer_inputs, as_dict=True)

        # Both forks have the same output names, so their outputs can be
        # added key by key.
        gru_inputs = {key: forked_inner[key] + forked_outer[key]
                      for key in forked_inner}

        new_states = self.inner_gru.apply(
            iterate=False,
            **dict_union(gru_inputs, {'states': states}))
        return new_states

    def get_dim(self, name):
        """Return the dimension of ``name``.

        ``'states'`` is answered by the inner GRU. Any other name is
        delegated to the base class instead of returning the
        ``AttributeError`` class as if it were a dimension.
        """
        if name == 'states':
            return self.inner_gru.get_dim(name)
        return super(InnerRecurrent, self).get_dim(name)
Exemplo n.º 16
0
class RecurrentWithFork(Initializable):
    """Recurrent transition brick fed by a fork of a single input.

    All configuration (dimensions and initialization schemes) is applied
    to the children eagerly in ``__init__``.

    Parameters
    ----------
    transition : brick
        Recurrent brick; its ``apply`` application is used.
    input_dim : int
        Input dimension of the fork.
    hidden_dim : int
        Assigned to ``transition.dim``.
    rec_weights_init : object
        Weight initialization scheme assigned to the transition.
    ff_weights_init : object
        Weight initialization scheme assigned to the fork.
    biases_init : object
        Bias initialization scheme assigned to both the transition and
        the fork.

    """

    @lazy(allocation=['input_dim'])
    def __init__(self, transition, input_dim, hidden_dim,
                 rec_weights_init, ff_weights_init, biases_init, **kwargs):
        super(RecurrentWithFork, self).__init__(**kwargs)
        self.rec_weights_init = rec_weights_init
        self.ff_weights_init = ff_weights_init
        self.biases_init = biases_init
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim

        self.transition = transition
        self.transition.dim = self.hidden_dim
        self.transition.weights_init = self.rec_weights_init
        # The attribute is ``biases_init`` (as used on the fork below); the
        # previous ``bias_init`` spelling set an attribute nothing reads.
        self.transition.biases_init = self.biases_init

        # The fork outputs every sequence input of the transition except
        # the mask.
        self.fork = Fork(
            [name for name in self.transition.apply.sequences
             if name != 'mask'],
            prototype=Linear())
        self.fork.input_dim = self.input_dim
        self.fork.output_dims = [self.transition.apply.brick.get_dim(name)
                                 for name in self.fork.output_names]
        self.fork.weights_init = self.ff_weights_init
        self.fork.biases_init = self.biases_init

        self.children = [transition, self.fork]

    # NOTE(review): there are deliberately no _push_allocation_config /
    # _push_initialization_config overrides; the children are configured
    # once in __init__. An ``input_dim`` supplied after construction is
    # therefore not re-pushed to the fork -- confirm this is intended.

    @application(inputs=['input_', 'mask'])
    def apply(self, input_, mask=None, **kwargs):
        """Fork ``input_`` and apply the transition to the result.

        Returns only the first element when the transition returns a list
        (e.g. ``[states, cells]`` for an LSTM), otherwise the returned
        value unchanged.
        """
        states = self.transition.apply(
            mask=mask,
            **dict_union(self.fork.apply(input_, as_dict=True), kwargs))
        # Blocks returns a list [states, cells] for LSTM but bare states
        # for GRU or a plain RNN. Only the states are exposed; cells should
        # not be visible from outside.
        return states[0] if isinstance(states, list) else states

    @apply.property('outputs')
    def apply_outputs(self):
        """Output names of ``apply``: the transition's state names."""
        return self.transition.apply.states
Exemplo n.º 17
0
def gru_layer(dim, h, n):
    """Create GRU layer number ``n`` on top of ``h`` and return its output.

    A fork with ``input_dim=dim`` and output dims ``[dim, dim * 2]`` is
    built together with a ``GatedRecurrent`` of size ``dim``. Both are
    initialized, and the fork's two outputs are passed positionally to
    ``GatedRecurrent.apply``. All names carry ``n`` as a suffix.
    """
    index = str(n)
    gru = None  # assigned below, after the fork, to keep construction order
    fork = Fork(
        output_names=["linear" + index, "gates" + index],
        name="fork" + index,
        input_dim=dim,
        output_dims=[dim, dim * 2],
    )
    gru = GatedRecurrent(dim=dim, name="gru" + index)
    initialize([fork, gru])

    forked = fork.apply(h)
    linear, gates = forked
    return gru.apply(linear, gates)
Exemplo n.º 18
0
def gru_layer(dim, h, n, x_mask, first, **kwargs):
    """Create GRU layer number ``n`` on top of ``h``, optionally masked.

    A fork with ``input_dim=dim`` and output dims ``[dim, dim * 2]`` and a
    ``GatedRecurrent`` of size ``dim`` are built and initialized. When
    ``first`` is truthy, ``x_mask`` is passed to the GRU as ``mask``;
    otherwise no mask is passed. Extra keyword arguments are forwarded to
    ``GatedRecurrent.apply`` in both cases.
    """
    suffix = str(n)
    fork = Fork(
        output_names=['linear' + suffix, 'gates' + suffix],
        name='fork' + suffix,
        input_dim=dim,
        output_dims=[dim, dim * 2])
    gru = GatedRecurrent(dim=dim, name='gru' + suffix)
    initialize([fork, gru])

    linear, gates = fork.apply(h)
    if not first:
        return gru.apply(linear, gates, **kwargs)
    return gru.apply(linear, gates, mask=x_mask, **kwargs)
Exemplo n.º 19
0
class Feedback(Initializable):
    """Feedback brick: an optional embedding followed by a fork.

    Parameters
    ----------
    output_names : list
        Names of the fork outputs; also set as the outputs of ``apply``.
    output_dims : list or dict
        Assigned as-is to the fork's ``output_dims``.
        NOTE(review): the original documentation calls this a dict --
        confirm the expected type against ``Fork``.
    embedding : brick, optional
        When given, applied to the input before the fork.
    input_dim : int
        Input dimension of the embedding if there is one, otherwise of
        the fork.

    """
    @lazy(allocation=['output_names', 'output_dims'])
    def __init__(self,
                 output_names,
                 output_dims,
                 embedding=None,
                 input_dim=0,
                 **kwargs):
        super(Feedback, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_names = output_names
        self.output_dims = output_dims

        self.embedding = embedding
        self.fork = Fork(self.output_names)

        self.apply.inputs = ['input']
        self.apply.outputs = output_names

        # Register both candidates, then keep only the truthy ones (the
        # embedding may be None).
        self.children = [self.embedding, self.fork]
        self.children = [child for child in self.children if child]

    def _push_allocation_config(self):
        """Push output dims, then wire the input dims through the chain."""
        if not self.fork:
            self.embedding.output_dim, = self.output_dims
        else:
            self.fork.output_dims = self.output_dims

        # The fork consumes the embedding output when an embedding exists,
        # otherwise the raw input.
        if self.embedding:
            self.embedding.input_dim = self.input_dim
            fork_input_dim = self.embedding.output_dim
        else:
            fork_input_dim = self.input_dim
        self.fork.input_dim = fork_input_dim

    @application
    def apply(self, symbols):
        """Embed ``symbols`` if an embedding is set, then fork the result."""
        if self.embedding:
            embedded_symbols = self.embedding.apply(symbols)
        else:
            embedded_symbols = symbols
        if not self.fork:
            return embedded_symbols
        return self.fork.apply(embedded_symbols)
Exemplo n.º 20
0
class Feedback(Initializable):
    """Feedback brick: an optional embedding followed by a fork.

    Parameters
    ----------
    output_names : list
        Names of the fork outputs; also set as the outputs of ``apply``.
    output_dims : list or dict
        Assigned as-is to the fork's ``output_dims``.
        NOTE(review): the original documentation calls this a dict --
        confirm the expected type against ``Fork``.
    embedding : brick, optional
        When given, applied to the input before the fork.
    input_dim : int
        Input dimension of the embedding if there is one, otherwise of
        the fork.

    """
    @lazy(allocation=['output_names', 'output_dims'])
    def __init__(self, output_names, output_dims,
                 embedding=None, input_dim=0,
                 **kwargs):
        super(Feedback, self).__init__(**kwargs)

        self.output_names = output_names
        self.output_dims = output_dims
        self.input_dim = input_dim

        self.fork = Fork(self.output_names)
        self.embedding = embedding

        self.apply.inputs = ['input']
        self.apply.outputs = output_names

        # Register both candidates, then keep only the truthy ones (the
        # embedding may be None).
        self.children = [self.embedding, self.fork]
        self.children = [child for child in self.children if child]

    def _push_allocation_config(self):
        """Push output dims, then wire the input dims through the chain."""
        if not self.fork:
            self.embedding.output_dim, = self.output_dims
        else:
            self.fork.output_dims = self.output_dims

        if not self.embedding:
            # No embedding: the fork consumes the raw input.
            self.fork.input_dim = self.input_dim
        else:
            # The embedding consumes the input and feeds the fork.
            self.embedding.input_dim = self.input_dim
            self.fork.input_dim = self.embedding.output_dim

    @application
    def apply(self, symbols):
        """Embed ``symbols`` if an embedding is set, then fork the result."""
        embedded_symbols = (
            self.embedding.apply(symbols) if self.embedding else symbols)
        if self.fork:
            return self.fork.apply(embedded_symbols)
        return embedded_symbols
Exemplo n.º 21
0
class Encoder(Initializable):
    """GRU encoder that returns the last element of the recurrent output.

    Parameters
    ----------
    vocab_size : int
        Pushed to the lookup table as its ``length``.
    embedding_dim : int
        Pushed to the lookup table as its ``dim`` and to the fork as its
        ``input_dim``.
    state_dim : int
        Pushed to the transition as its ``dim`` and used for every fork
        output dimension.
    reverse : bool
        When true, the sentence and its mask are reversed along the first
        axis before encoding.

    """
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')

        # The fork outputs every sequence input of the transition except
        # the mask.
        fork_outputs = [
            seq for seq in self.transition.apply.sequences if seq != 'mask'
        ]
        self.fork = Fork(fork_outputs, prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        """Propagate dimensions to the lookup, transition and fork."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        self.fork.output_dims = [
            self.state_dim for _unused in self.fork.output_names
        ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Encode the sentence and return ``representation[-1]``."""
        # Time as first dimension
        sentence = source_sentence.dimshuffle(1, 0)
        mask = source_sentence_mask.T
        if self.reverse:
            sentence = sentence[::-1]
            mask = mask[::-1]

        embeddings = self.lookup.apply(sentence)
        transition_inputs = merge(
            self.fork.apply(embeddings, as_dict=True), {'mask': mask})
        representation = self.transition.apply(**transition_inputs)
        return representation[-1]
Exemplo n.º 22
0
class Encoder(Initializable):
    """Lookup table, fork and gated recurrent transition chained together.

    ``vocab_size`` and ``embedding_dim`` configure the lookup table,
    ``state_dim`` configures the transition and every fork output, and
    ``reverse`` selects whether the input is flipped along the leading
    axis before encoding.
    """

    def __init__(self, vocab_size, embedding_dim, state_dim, reverse=True,
                 **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.reverse = reverse

        self.lookup = LookupTable(name='embeddings')
        self.transition = GatedRecurrent(Tanh(), name='encoder_transition')

        # The fork feeds every transition sequence except the mask.
        sequence_names = []
        for sequence in self.transition.apply.sequences:
            if sequence != 'mask':
                sequence_names.append(sequence)
        self.fork = Fork(sequence_names, prototype=Linear())

        self.children = [self.lookup, self.transition, self.fork]

    def _push_allocation_config(self):
        """Copy the stored dimensions onto the child bricks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.transition.dim = self.state_dim
        self.fork.input_dim = self.embedding_dim
        dims = []
        for _ in self.fork.output_names:
            dims.append(self.state_dim)
        self.fork.output_dims = dims

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Encode the sentence and return the final entry of the states."""
        # Time as first dimension
        sentence = source_sentence.dimshuffle(1, 0)
        mask = source_sentence_mask.T
        if self.reverse:
            flip = slice(None, None, -1)
            sentence = sentence[flip]
            mask = mask[flip]

        forked = self.fork.apply(self.lookup.apply(sentence), as_dict=True)
        transition_kwargs = merge(forked, {'mask': mask})
        return self.transition.apply(**transition_kwargs)[-1]
Exemplo n.º 23
0
def build_fork_lookup(vocab_size, args):
    """Compile a function that forks a lookup-table embedding per layer.

    A ``Fork`` whose prototype is a ``FeedforwardSequence`` wrapping a
    ``LookupTable`` is applied to an integer matrix named ``'features'``.
    The hyper-parameters (embedding size, number of layers, skip
    connections) are fixed inside the function.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the lookup table.
    args
        Unused; kept so existing callers keep working.

    Returns
    -------
    The Theano function mapping the ``'features'`` matrix to the fork
    output(s).
    """
    x = tensor.lmatrix('features')
    virtual_dim = 6
    time_length = 5
    skip_connections = True
    layers = 3

    # Build the model: the first layer always receives the input, deeper
    # layers only when skip connections are enabled.
    output_names = []
    output_dims = []
    for d in range(layers):
        suffix = '_' + str(d) if d > 0 else ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    # Single-argument print calls behave identically on Python 2 and 3.
    print(output_names)
    print(output_dims)
    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)
    fork.initialize()

    f = theano.function([x], pre_rnn)
    return f
Exemplo n.º 24
0
def build_fork_lookup(vocab_size, args):
    """Compile a function that forks a lookup-table embedding per layer.

    The ``'features'`` integer matrix is passed through a ``Fork`` whose
    prototype is a ``FeedforwardSequence`` around a ``LookupTable``.  All
    hyper-parameters other than the vocabulary size are fixed locally.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the lookup table.
    args
        Unused; kept so existing callers keep working.

    Returns
    -------
    The Theano function mapping the ``'features'`` matrix to the fork
    output(s).
    """
    x = tensor.lmatrix('features')
    virtual_dim = 6
    time_length = 5
    skip_connections = True
    layers = 3

    # Build the model: layer 0 always gets an input, the others only with
    # skip connections.
    output_names = []
    output_dims = []
    for d in range(layers):
        suffix = '_' + str(d) if d > 0 else ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    # Function-call form of print works on both Python 2 and Python 3
    # when given a single argument.
    print(output_names)
    print(output_dims)
    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)
    fork.initialize()

    f = theano.function([x], pre_rnn)
    return f
Exemplo n.º 25
0
class Encoder(Initializable):
    """Encoder of RNNsearch model.

    ``blockid`` is a string appended to the names of the child bricks
    (lookup table, recurrent layer and fork).
    """

    def __init__(self, blockid, vocab_size, embedding_dim, state_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.blockid = blockid

        suffix = '_' + self.blockid
        self.lookup = LookupTable(name='embeddings' + suffix)
        self.gru = GatedRecurrent(
            activation=Tanh(),
            dim=state_dim,
            name="GatedRNN" + self.blockid)

        # The fork produces every recurrent input sequence but the mask.
        sequence_names = [
            seq for seq in self.gru.apply.sequences if seq != 'mask'
        ]
        self.fwd_fork = Fork(
            sequence_names, prototype=Linear(), name='fwd_fork' + suffix)

        self.children = [self.lookup, self.gru, self.fwd_fork]

    def _push_allocation_config(self):
        """Set the lookup table and fork dimensions from stored values."""
        lookup, fork = self.lookup, self.fwd_fork
        lookup.length = self.vocab_size
        lookup.dim = self.embedding_dim

        fork.input_dim = self.embedding_dim
        dims = []
        for output_name in fork.output_names:
            dims.append(self.gru.get_dim(output_name))
        fork.output_dims = dims

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Return the recurrent layer output for the transposed inputs."""
        # Time as first dimension
        tokens = source_sentence.T
        mask = source_sentence_mask.T

        forked = self.fwd_fork.apply(self.lookup.apply(tokens), as_dict=True)
        gru_kwargs = merge(forked, {'mask': mask})
        return self.gru.apply(**gru_kwargs)
Exemplo n.º 26
0
def build_fork_lookup(vocab_size, time_length, args):
    """Compile functions for the forked embedding and the clockwork RNN.

    A ``Fork`` around a ``LookupTable`` embeds the ``'features'`` integer
    matrix; the result is transposed to put time first and fed to a
    ``RecurrentStack`` of ``ClockworkBase`` transitions.  Layer count,
    dimensions and skip connections are fixed inside the function.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the lookup table.
    time_length : int
        Passed to the fork as its ``input_dim``.
    args
        Unused; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(f_pre_rnn, f_h)``: Theano functions computing the transposed
        fork output and the recurrent stack output respectively.
    """
    x = tensor.lmatrix('features')
    virtual_dim = 6
    state_dim = 6
    skip_connections = False
    layers = 1

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        suffix = '_' + str(d) if d > 0 else ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=time_length,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    # Note that this order of the periods makes faster modules flow in slower
    # ones which is the opposite of the original paper
    transitions = [ClockworkBase(dim=state_dim, activation=Tanh(),
                                 period=2 ** i) for i in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # Return list of 3D Tensor, one for each layer
    # (Batch X Time X embedding_dim)
    pre_rnn = fork.apply(x)

    # The fork output is treated as a list only when it has several
    # outputs; the same test is used below when wiring the RNN inputs so
    # that a single output is never indexed by layer.
    several_inputs = layers > 1 and skip_connections

    # Give time as the first index for each element in the list:
    # (Time X Batch X embedding_dim)
    if several_inputs:
        pre_rnn = [layer_input.dimshuffle(1, 0, 2) for layer_input in pre_rnn]
    else:
        pre_rnn = pre_rnn.dimshuffle(1, 0, 2)

    f_pre_rnn = theano.function([x], pre_rnn)

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    if several_inputs:
        for d, layer_input in enumerate(pre_rnn):
            suffix = '_' + str(d) if d > 0 else ''
            kwargs['inputs' + suffix] = layer_input
    else:
        kwargs['inputs'] = pre_rnn

    # Single-argument print call: same output on Python 2 and Python 3.
    print(kwargs)
    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    f_h = theano.function([x], h)
    return f_pre_rnn, f_h
Exemplo n.º 27
0
class RecurrentEncoder(Initializable):
    """Encode a latitude/longitude sequence together with context embeddings.

    The two coordinate sequences are normalised, stacked, forked into the
    inputs of a ``SegregatedBidirectional`` LSTM, and two slices of the
    recurrent output are concatenated with the context embeddings before
    being passed through an MLP.

    Parameters
    ----------
    config
        Object providing ``rec_state_dim``, ``dim_embeddings``,
        ``dim_hidden``, ``weights_init`` and ``biases_init``.
    output_dim : int
        Size of the last MLP layer.
    activation
        Activation brick used for the last MLP layer.
    """

    def __init__(self, config, output_dim, activation, **kwargs):
        super(RecurrentEncoder, self).__init__(**kwargs)

        self.config = config
        self.context_embedder = ContextEmbedder(config)

        lstm = LSTM(dim=config.rec_state_dim, name='encoder_recurrent')
        self.rec = SegregatedBidirectional(lstm)

        def sequence_names():
            # Fresh list per fork: every recurrent sequence except the mask.
            return [name for name in self.rec.prototype.apply.sequences
                    if name != 'mask']

        self.fwd_fork = Fork(sequence_names(),
                             prototype=Linear(), name='fwd_fork')
        self.bkwd_fork = Fork(sequence_names(),
                              prototype=Linear(), name='bkwd_fork')

        embedding_width = sum(x[2] for x in config.dim_embeddings)
        rto_in = config.rec_state_dim * 2 + embedding_width
        hidden_activations = [Rectifier() for _ in config.dim_hidden]
        self.rec_to_output = MLP(
            activations=hidden_activations + [activation],
            dims=[rto_in] + config.dim_hidden + [output_dim],
            name='encoder_rto')

        self.children = [self.context_embedder, self.rec, self.fwd_fork,
                         self.bkwd_fork, self.rec_to_output]

        self.rec_inputs = ['latitude', 'longitude', 'latitude_mask']
        self.inputs = self.context_embedder.inputs + self.rec_inputs

    def _push_allocation_config(self):
        """Size each fork from the matching child of the recurrent brick."""
        for index, fork in enumerate((self.fwd_fork, self.bkwd_fork)):
            fork.input_dim = 2
            direction = self.rec.children[index]
            fork.output_dims = [direction.get_dim(name)
                                for name in fork.output_names]

    def _push_initialization_config(self):
        """Give every child the initialisation schemes from the config."""
        weights_init = self.config.weights_init
        biases_init = self.config.biases_init
        for child in self.children:
            child.weights_init = weights_init
            child.biases_init = biases_init

    @application
    def apply(self, latitude, longitude, latitude_mask, **kwargs):
        """Return the MLP output for a batch of paths and their context."""
        gps_mean, gps_std = data.train_gps_mean, data.train_gps_std
        latitude = (latitude.T - gps_mean[0]) / gps_std[0]
        longitude = (longitude.T - gps_mean[1]) / gps_std[1]
        mask = latitude_mask.T

        # Stack the two coordinates along a new trailing axis.
        rec_in = tensor.concatenate(
            (latitude[:, :, None], longitude[:, :, None]), axis=2)

        fwd_inputs = merge(self.fwd_fork.apply(rec_in, as_dict=True),
                           {'mask': mask})
        bkwd_inputs = merge(self.bkwd_fork.apply(rec_in, as_dict=True),
                            {'mask': mask})
        path = self.rec.apply(fwd_inputs, bkwd_inputs)[0]

        last_id = tensor.cast(mask.sum(axis=0) - 1, dtype='int64')
        state_dim = self.config.rec_state_dim
        batch_index = tensor.arange(last_id.shape[0])

        # Trailing state_dim columns of the first step.
        first_slice = path[0][:, -state_dim:]
        # NOTE(review): last_id already has 1 subtracted, so indexing with
        # last_id - 1 selects the step before it -- confirm this is intended.
        last_slice = path[last_id - 1, batch_index][:, :state_dim]

        embeddings = tuple(self.context_embedder.apply(
            **{k: kwargs[k] for k in self.context_embedder.inputs}))

        inputs = tensor.concatenate(
            (first_slice, last_slice) + embeddings, axis=1)
        return self.rec_to_output.apply(inputs)

    @apply.property('inputs')
    def apply_inputs(self):
        return self.inputs
Exemplo n.º 28
0
class Scribe(Initializable):
    """Recurrent generator with a soft window over a one-hot context.

    A single ``GatedRecurrent`` cell is driven by the previous output and
    by the previous window ``w``.  From the cell state the mixture
    parameters ``alpha``, ``beta`` and ``kappa`` are computed; they weight
    the positions of the one-hot encoded context to form the next window.
    The cell state and the window are mapped to readouts that are handed
    to a ``BivariateGMMEmitter``.

    Parameters
    ----------
    k : int
        Passed to the emitter; the readout size is ``1 + 6 * k``.
    rec_h_dim : int
        Dimension of the recurrent cell.
    att_size : int
        Number of attention mixture components.
    num_letters : int
        Size of the one-hot encoding of the context.
    sampling_bias : float
        Passed to the emitter.
    attention_type : str
        ``"graves"`` or ``"softmax"``; selects how ``alpha`` is normalised
        and which window formula is used.
    epsilon : float
        Added to ``exp(beta)``.
    attention_alignment : float
        Multiplies the ``exp(kappa)`` increment.
    """

    def __init__(self,
                 k=20,
                 rec_h_dim=400,
                 att_size=10,
                 num_letters=68,
                 sampling_bias=0.,
                 attention_type="graves",
                 epsilon=1e-6,
                 attention_alignment=1.,
                 **kwargs):
        super(Scribe, self).__init__(**kwargs)

        # For now only softmax and graves are supported.
        assert attention_type in ["graves", "softmax"]

        readouts_dim = 1 + 6 * k

        self.k = k
        self.rec_h_dim = rec_h_dim
        self.att_size = att_size
        self.num_letters = num_letters
        self.sampling_bias = sampling_bias
        self.attention_type = attention_type
        self.epsilon = epsilon
        self.attention_alignment = attention_alignment

        self.cell1 = GatedRecurrent(dim=rec_h_dim, name='cell1')

        self.inp_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                              input_dim=3,
                              output_dims=[rec_h_dim, 2 * rec_h_dim],
                              name='inp_to_h1')

        self.h1_to_readout = Linear(input_dim=rec_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')

        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rec_h_dim,
                              output_dims=[att_size] * 3,
                              name='h1_to_att')

        self.att_to_h1 = Fork(output_names=['cell1_inputs', 'cell1_gates'],
                              input_dim=num_letters,
                              output_dims=[rec_h_dim, 2 * rec_h_dim],
                              name='att_to_h1')

        self.att_to_readout = Linear(input_dim=num_letters,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')

        self.emitter = BivariateGMMEmitter(k=k, sampling_bias=sampling_bias)

        self.children = [
            self.cell1, self.inp_to_h1, self.h1_to_readout, self.h1_to_att,
            self.att_to_h1, self.att_to_readout, self.emitter
        ]

    def _allocate(self):
        """Create the shared initial window and tag it as an initial state."""
        self.initial_w = shared_floatx_zeros((self.num_letters, ),
                                             name="initial_w")

        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        """Return the symbolic inputs used for training.

        Returns
        -------
        tuple
            ``(data, data_mask, context, context_mask, start_flag)``.
        """
        data = tensor.tensor3('features')
        data_mask = tensor.matrix('features_mask')
        context = tensor.imatrix('transcripts')
        context_mask = tensor.matrix('transcripts_mask')
        start_flag = tensor.scalar('start_flag')

        return data, data_mask, context, context_mask, start_flag

    def initial_states(self, batch_size):
        """Build the initial and carried-over state variables.

        Returns
        -------
        tuple
            ``(initial_h1, initial_kappa, initial_w, last_h1, last_w,
            use_last_states)``; all but ``initial_h1`` and ``initial_w`` are
            freshly created shared variables.
        """
        initial_h1 = self.cell1.initial_states(batch_size)
        initial_kappa = shared_floatx_zeros((batch_size, self.att_size))
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)
        last_h1 = shared_floatx_zeros((batch_size, self.rec_h_dim))
        last_w = shared_floatx_zeros((batch_size, self.num_letters))
        use_last_states = shared(numpy.asarray(0., dtype=floatX))

        return initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states

    def _attention_step(self, xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1,
                        ctx, u):
        """Advance the cell and the attention window by one step.

        Shared by ``compute_cost`` and ``sample_model``.

        Parameters
        ----------
        xinp_h1_t, xgat_h1_t
            Input and gate contributions coming from the data at this step.
        h1_tm1, k_tm1, w_tm1
            Previous cell state, ``kappa`` and window.
        ctx
            One-hot encoded, masked context.
        u
            Context positions, padded on the left for broadcasting.

        Returns
        -------
        tuple
            ``(h1_t, k_t, w_t, phi_t, a_t)`` where ``a_t`` is the mixture
            weight after right-padding.
        """
        attinp_h1, attgat_h1 = self.att_to_h1.apply(w_tm1)

        h1_t = self.cell1.apply(xinp_h1_t + attinp_h1,
                                xgat_h1_t + attgat_h1,
                                h1_tm1,
                                iterate=False)

        a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

        use_softmax = self.attention_type == "softmax"
        if use_softmax:
            a_t = tensor.nnet.softmax(a_t)
        else:
            a_t = tensor.exp(a_t)

        b_t = tensor.exp(b_t) + self.epsilon
        # kappa only ever increases: a positive increment is accumulated.
        k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

        a_t = tensor.shape_padright(a_t)
        b_t = tensor.shape_padright(b_t)
        k_t_ = tensor.shape_padright(k_t)

        # batch size X att size X len context
        if use_softmax:
            # The constant is 1 / sqrt(2 * pi).
            phi_t = 0.3989422917366028 * tensor.sum(
                a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                    (k_t_ - u)**2),
                axis=1)
        else:
            phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                               axis=1)

        # batch size X len context X num letters
        w_t = (tensor.shape_padright(phi_t) * ctx).sum(axis=1)

        return h1_t, k_t, w_t, phi_t, a_t

    @application
    def compute_cost(self, data, data_mask, context, context_mask, start_flag,
                     batch_size):
        """Return the masked mean emitter cost and the state updates.

        ``data[:-1]`` is used as input and ``data[1:]`` as target.  The
        returned updates carry the last cell state, ``kappa`` and window
        over to the next call; a non-zero ``start_flag`` resets ``kappa``
        and disables the carried-over states for the next call.

        Returns
        -------
        tuple
            ``(cost, updates)`` where ``updates`` is the scan updates
            followed by the state updates.
        """
        x = data[:-1]
        target = data[1:]
        mask = data_mask[1:]
        xinp_h1, xgat_h1 = self.inp_to_h1.apply(x)
        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        initial_h1, initial_kappa, initial_w, \
            last_h1, last_w, use_last_states = \
            self.initial_states(batch_size)

        input_h1 = tensor.switch(use_last_states, last_h1, initial_h1)
        input_w = tensor.switch(use_last_states, last_w, initial_w)

        u = tensor.shape_padleft(tensor.arange(context.shape[1], dtype=floatX),
                                 2)

        def step(xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1, ctx):
            h1_t, k_t, w_t, _, _ = self._attention_step(
                xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1, ctx, u)
            return h1_t, k_t, w_t

        (h1, kappa, w), scan_updates = theano.scan(
            fn=step,
            sequences=[xinp_h1, xgat_h1],
            non_sequences=[context_oh],
            outputs_info=[input_h1, initial_kappa, input_w])

        readouts = self.h1_to_readout.apply(h1) + \
            self.att_to_readout.apply(w)

        cost = self.emitter.cost(readouts, target)
        # The zero-weighted start_flag term keeps it in the cost graph.
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = [
            (last_h1, h1[-1]),
            (initial_kappa,
             tensor.switch(start_flag, 0. * initial_kappa, kappa[-1])),
            (last_w, w[-1]),
            (use_last_states, 1. - start_flag),
        ]

        return cost, scan_updates + updates

    @application
    def sample_model(self, context, context_mask, n_steps, batch_size):
        """Sample ``n_steps`` outputs conditioned on ``context``.

        Returns
        -------
        tuple
            ``(sample_x, pi, phi, pi_att, updates)``: the emitted samples,
            the fourth component returned by the emitter for each step,
            the window weights, the padded attention mixture weights and
            the scan updates.
        """
        initial_h1, initial_kappa, initial_w = \
            self.initial_states(batch_size)[:3]

        initial_x = self.emitter.initial_outputs(batch_size)

        context_oh = one_hot(context, self.num_letters) * \
            tensor.shape_padright(context_mask)

        u = tensor.shape_padleft(tensor.arange(context.shape[1], dtype=floatX),
                                 2)

        def sample_step(x_tm1, h1_tm1, k_tm1, w_tm1, ctx):
            xinp_h1_t, xgat_h1_t = self.inp_to_h1.apply(x_tm1)

            h1_t, k_t, w_t, phi_t, a_t = self._attention_step(
                xinp_h1_t, xgat_h1_t, h1_tm1, k_tm1, w_tm1, ctx, u)

            readout_t = self.h1_to_readout.apply(h1_t) + \
                self.att_to_readout.apply(w_t)

            x_t = self.emitter.emit(readout_t)

            # Only the fourth component is returned to the caller.
            _, _, _, pi_t, _ = self.emitter.components(readout_t)

            return x_t, h1_t, k_t, w_t, pi_t, phi_t, a_t

        (sample_x, h1, k, w, pi, phi,
         pi_att), updates = theano.scan(fn=sample_step,
                                        n_steps=n_steps,
                                        sequences=[],
                                        non_sequences=[context_oh],
                                        outputs_info=[
                                            initial_x.eval(), initial_h1,
                                            initial_kappa, initial_w, None,
                                            None, None
                                        ])

        return sample_x, pi, phi, pi_att, updates
def build_model_vanilla(vocab_size, args, dtype=floatX):
    """Build a stacked ``SimpleRecurrent`` language model and its cost.

    Parameters
    ----------
    vocab_size : int
        Size of the lookup table and of the output layer.
    args
        Object providing ``context``, ``state_dim``, ``layers``,
        ``skip_connections`` and ``mini_batch_size``.
    dtype
        Not used by this function; kept for interface compatibility.

    Returns
    -------
    tuple
        ``(cost, cross_entropy, updates)`` where ``updates`` stores the
        last hidden state of every layer into the shared variable used as
        that layer's initial state.
    """
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model
    output_names = []
    output_dims = []
    for d in range(layers):
        suffix = '_' + str(d) if d > 0 else ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())
                   for _ in range(layers)]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(
        input_dim=layers * state_dim if skip_connections else state_dim,
        output_dim=vocab_size, name="output_layer")

    # One 3D tensor per fork output
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # A fork with a single output yields a bare variable instead of a list
    # (this happens without skip connections, and also with skip
    # connections when there is only one layer).  Normalise to a list so
    # both cases are handled the same way below.
    if isinstance(pre_rnn, (list, tuple)):
        pre_rnn = list(pre_rnn)
    else:
        pre_rnn = [pre_rnn]

    # Give a name to the input of each layer
    if skip_connections:
        for t, layer_input in enumerate(pre_rnn):
            layer_input.name = "pre_rnn_" + str(t)
    else:
        pre_rnn[0].name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d in range(layers):
        suffix = '_' + str(d) if d > 0 else ''
        # Without skip connections only layer 0 has a forked input.
        if d < len(pre_rnn):
            kwargs['inputs' + suffix] = pre_rnn[d]
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # We have
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = state if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    last_states = {}
    if layers > 1:
        # Save all the last states
        for d in range(layers):
            last_states[d] = h[d][-1, :, :]
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        last_states[0] = h[-1, :, :]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = [(init_states[d], last_states[d]) for d in range(layers)]

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    n_steps, n_batch, n_feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((n_batch * n_steps, n_feat)))
    # Divide by log(2) to express the cross entropy in bits.
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
Exemplo n.º 30
0
class BidirectionalEncoder(Initializable):
    """A generalized version of the vanilla encoder of the RNNsearch
    model which supports different numbers of layers. Zero layers
    represent non-recurrent encoders.
    """
    def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
                 state_dim, **kwargs):
        """Sole constructor.

        Args:
            vocab_size (int): Source vocabulary size
            embedding_dim (int): Dimension of the embedding layer
            n_layers (int): Number of layers. Layers share the same
                            weight matrices.
            skip_connections (bool): Skip connections connect the
                                     source word embeddings directly
                                     with deeper layers to propagate
                                     the gradient more efficiently
            state_dim (int): Number of hidden units in the recurrent
                             layers.
        """
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.state_dim = state_dim
        self.skip_connections = skip_connections

        self.lookup = LookupTable(name='embeddings')
        if self.n_layers >= 1:
            self.bidir = BidirectionalWMT15(
                GatedRecurrent(activation=Tanh(), dim=state_dim))
            self.fwd_fork = self._build_fork('fwd_fork')
            self.back_fork = self._build_fork('back_fork')
            self.children = [
                self.lookup, self.bidir, self.fwd_fork, self.back_fork
            ]
            if self.n_layers > 1:  # Deep encoder
                self.mid_fwd_fork = self._build_fork('mid_fwd_fork')
                self.mid_back_fork = self._build_fork('mid_back_fork')
                self.children.append(self.mid_fwd_fork)
                self.children.append(self.mid_back_fork)
        elif self.n_layers == 0:
            # Without recurrent layers the embeddings are the annotations,
            # so they take the width of a bidirectional state.
            self.embedding_dim = state_dim * 2
            self.children = [self.lookup]
        else:
            # Only logged; construction continues as in the original code.
            logging.fatal("Number of encoder layers must be non-negative")

    def _build_fork(self, name):
        """Create a linear fork feeding every recurrent sequence but the mask."""
        sequence_names = [
            seq for seq in self.bidir.prototype.apply.sequences
            if seq != 'mask'
        ]
        return Fork(sequence_names, prototype=Linear(), name=name)

    def _configure_fork(self, fork, input_dim, transition):
        """Set a fork's input size and size its outputs from ``transition``."""
        fork.input_dim = input_dim
        fork.output_dims = [
            transition.get_dim(name) for name in fork.output_names
        ]

    def _push_allocation_config(self):
        """Sets the parameters of sub bricks """
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        if self.n_layers >= 1:
            forward, backward = self.bidir.children[0], self.bidir.children[1]
            self._configure_fork(self.fwd_fork, self.embedding_dim, forward)
            self._configure_fork(self.back_fork, self.embedding_dim, backward)
            if self.n_layers > 1:  # Deep encoder
                # Deeper layers read the concatenated bidirectional state,
                # plus the embeddings when skip connections are enabled.
                inp_dim = self.state_dim * 2
                if self.skip_connections:
                    inp_dim += self.embedding_dim
                # Each mid fork is sized from its own output names.
                self._configure_fork(self.mid_fwd_fork, inp_dim, forward)
                self._configure_fork(self.mid_back_fork, inp_dim, backward)

    def _apply_bidir(self, fwd_fork, back_fork, inp, mask):
        """Run the bidirectional brick on ``inp`` through the given forks."""
        return self.bidir.apply(
            merge(fwd_fork.apply(inp, as_dict=True), {'mask': mask}),
            merge(back_fork.apply(inp, as_dict=True), {'mask': mask}))

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Produces source annotations, either non-recurrently or with
        a bidirectional RNN architecture.

        Returns the annotations together with the transposed mask.
        """
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        embeddings = self.lookup.apply(source_sentence)

        if self.n_layers >= 1:
            representation = self._apply_bidir(
                self.fwd_fork, self.back_fork, embeddings,
                source_sentence_mask)
            # ``range`` keeps this loop valid on both Python 2 and 3.
            for _ in range(self.n_layers - 1):
                if self.skip_connections:
                    inp = tensor.concatenate([representation, embeddings],
                                             axis=2)
                else:
                    inp = representation
                representation = self._apply_bidir(
                    self.mid_fwd_fork, self.mid_back_fork, inp,
                    source_sentence_mask)
        else:
            representation = embeddings
        return representation, source_sentence_mask
Exemplo n.º 31
0
class BidirectionalEncoderSigmoid(Initializable):
    """Single-layer bidirectional encoder of the RNNsearch model.

    This variant has no lookup table: ``apply`` feeds its (transposed)
    input straight into the forward/backward forks.  The recurrent
    transition is a ``GatedRecurrentWithZerosAtMask`` with a logistic
    activation.  Unlike the other encoders in this file, the
    constructor pushes the allocation/initialization configuration and
    calls ``initialize()`` itself.
    """
    def __init__(self, embedding_dim, state_dim, **kwargs):
        """Build, configure and initialize the encoder.

        Args:
            embedding_dim (int): Input dimension of both forks.
            state_dim (int): ``dim`` of the recurrent transition.
        """
        super(BidirectionalEncoderSigmoid, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        # Fixed seed for the brick's random state.
        curSeed = 1791095845
        self.rng = numpy.random.RandomState(curSeed)

        self.bidir = BidirectionalWMT15(
            GatedRecurrentWithZerosAtMask(activation=Logistic(),
                                          dim=state_dim))

        # Both forks produce every sequence input of the transition
        # except the mask, which is passed through separately in apply().
        sequence_names = [
            name
            for name in self.bidir.prototype.apply.sequences if name != 'mask'
        ]
        self.fwd_fork = Fork(list(sequence_names),
                             prototype=Linear(),
                             name='fwd_fork')
        self.back_fork = Fork(list(sequence_names),
                              prototype=Linear(),
                              name='back_fork')

        self.children = [self.bidir, self.fwd_fork, self.back_fork]

        # maybe not necessary? (maybe only necessary for decoder)
        self._push_allocation_config()

        # Single-argument parenthesised form: prints identically on
        # Python 2 and is valid syntax on Python 3.
        print("RNN seed: " + str(self.rng.get_state()[1][0]))
        # initialization of parameters
        self.weights_init = IsotropicGaussian()
        self.biases_init = Constant(0)
        self.push_initialization_config()
        # Override the pushed default for the recurrent weights only.
        self.bidir.prototype.weights_init = Orthogonal()
        self.initialize()

    def _push_allocation_config(self):
        """Set the input/output dimensions of the two forks."""
        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            self.bidir.children[0].get_dim(name)
            for name in self.fwd_fork.output_names
        ]
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            self.bidir.children[1].get_dim(name)
            for name in self.back_fork.output_names
        ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation'])
    def apply(self, source_sentence, source_sentence_mask):
        """Run the bidirectional RNN over already-embedded input.

        Both arguments are transposed before use.  The result is also
        stored on ``self.representation``.

        Returns:
            The output of ``self.bidir.apply``.
        """
        # Time as first dimension
        source_sentence = source_sentence.T
        source_sentence_mask = source_sentence_mask.T

        # No lookup: the input is used as the embedding directly.
        embeddings = source_sentence

        representation = self.bidir.apply(
            # Conversion to embedding representation here.
            # TODO: Less than the current number of dimensions should be totally fine.
            merge(self.fwd_fork.apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}),
            merge(self.back_fork.apply(embeddings, as_dict=True),
                  {'mask': source_sentence_mask}))
        self.representation = representation
        return representation
Exemplo n.º 32
0
class BidirectionalEncoder(Initializable):
    """Generalized RNNsearch encoder with a configurable layer count.

    With zero layers the encoder is non-recurrent and returns the word
    embeddings.  With one or more layers the same ``bidir`` brick is
    applied at every layer; the first layer reads the embeddings
    through ``fwd_fork``/``back_fork`` and every further layer reads
    its input through ``mid_fwd_fork``/``mid_back_fork``.
    """

    def __init__(self, vocab_size, embedding_dim, n_layers,
                 skip_connections, state_dim, **kwargs):
        """Sole constructor.

        Args:
            vocab_size (int): Source vocabulary size
            embedding_dim (int): Dimension of the embedding layer.
                                 Overridden with ``2 * state_dim`` when
                                 ``n_layers`` is 0.
            n_layers (int): Number of layers. A negative value is only
                            reported via ``logging.fatal``; no child
                            bricks are registered in that case.
            skip_connections (bool): If true, layers after the first
                                     receive the word embeddings
                                     concatenated to their input.
            state_dim (int): Number of hidden units in the recurrent
                             layers.
        """
        super(BidirectionalEncoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.n_layers = n_layers
        self.state_dim = state_dim
        self.skip_connections = skip_connections

        self.lookup = LookupTable(name='embeddings')

        if self.n_layers == 0:
            # Non-recurrent encoder: only the lookup table is needed.
            self.embedding_dim = state_dim * 2
            self.children = [self.lookup]
            return
        if not (self.n_layers >= 1):
            logging.fatal("Number of encoder layers must be non-negative")
            return

        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))

        def make_fork(fork_name):
            # One output per sequence input of the transition, mask
            # excluded (it is merged in separately in apply()).
            outputs = [seq for seq in self.bidir.prototype.apply.sequences
                       if seq != 'mask']
            return Fork(outputs, prototype=Linear(), name=fork_name)

        self.fwd_fork = make_fork('fwd_fork')
        self.back_fork = make_fork('back_fork')
        self.children = [self.lookup, self.bidir,
                         self.fwd_fork, self.back_fork]
        if self.n_layers > 1:  # Deep encoder
            self.mid_fwd_fork = make_fork('mid_fwd_fork')
            self.mid_back_fork = make_fork('mid_back_fork')
            self.children.append(self.mid_fwd_fork)
            self.children.append(self.mid_back_fork)

    def _push_allocation_config(self):
        """Sets the dimensions of the lookup table and of all forks."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        if not (self.n_layers >= 1):
            return

        def dims_of(direction, names):
            # direction 0 is used for forward forks, 1 for backward ones.
            return [self.bidir.children[direction].get_dim(name)
                    for name in names]

        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = dims_of(0, self.fwd_fork.output_names)
        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = dims_of(1, self.back_fork.output_names)

        if self.n_layers > 1:  # Deep encoder
            # Deeper layers read the previous representation
            # (2 * state_dim), plus the embeddings with skip connections.
            mid_input_dim = self.state_dim * 2
            if self.skip_connections:
                mid_input_dim += self.embedding_dim
            self.mid_fwd_fork.input_dim = mid_input_dim
            self.mid_fwd_fork.output_dims = dims_of(
                0, self.fwd_fork.output_names)
            self.mid_back_fork.input_dim = mid_input_dim
            self.mid_back_fork.output_dims = dims_of(
                1, self.back_fork.output_names)

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Produces source annotations, either non-recurrently or with
        a bidirectional RNN architecture.

        Returns:
            tuple. ``(representation, transposed_mask)``
        """
        # Time as first dimension
        sentence_t = source_sentence.T
        mask_t = source_sentence_mask.T

        embeddings = self.lookup.apply(sentence_t)

        if not (self.n_layers >= 1):
            return embeddings, mask_t

        def masked(fork_outputs):
            return merge(fork_outputs, {'mask': mask_t})

        representation = self.bidir.apply(
            masked(self.fwd_fork.apply(embeddings, as_dict=True)),
            masked(self.back_fork.apply(embeddings, as_dict=True)))

        for _ in xrange(self.n_layers - 1):
            if self.skip_connections:
                layer_input = tensor.concatenate(
                    [representation, embeddings], axis=2)
            else:
                layer_input = representation
            representation = self.bidir.apply(
                masked(self.mid_fwd_fork.apply(layer_input, as_dict=True)),
                masked(self.mid_back_fork.apply(layer_input, as_dict=True)))
        return representation, mask_t
Exemplo n.º 33
0
class NoLookupEncoder(Initializable):
    """Variation of ``BidirectionalEncoder`` for sparse feature maps.

    There is no lookup table: the predefined distributed
    representations are fed directly into the encoder network.
    """

    def __init__(self, embedding_dim, state_dim, **kwargs):
        """Constructor. Only single layer architectures are supported.

        Args:
            embedding_dim (int): Dimensionality of the word vectors
                                 defined by the sparse feature map.
            state_dim (int): Size of the recurrent layer.
        """
        super(NoLookupEncoder, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        recurrent = GatedRecurrent(activation=Tanh(), dim=state_dim)
        self.bidir = BidirectionalWMT15(recurrent)

        def build_fork(fork_name):
            # Every sequence input of the transition except the mask.
            targets = [seq for seq in self.bidir.prototype.apply.sequences
                       if seq != 'mask']
            return Fork(targets, prototype=Linear(), name=fork_name)

        self.fwd_fork = build_fork('fwd_fork')
        self.back_fork = build_fork('back_fork')
        self.children = [self.bidir, self.fwd_fork, self.back_fork]

    def _push_allocation_config(self):
        """Sets the dimensions of the forward and backward forks. """
        forward_rnn_index, backward_rnn_index = 0, 1

        self.fwd_fork.input_dim = self.embedding_dim
        self.fwd_fork.output_dims = [
            self.bidir.children[forward_rnn_index].get_dim(output)
            for output in self.fwd_fork.output_names]

        self.back_fork.input_dim = self.embedding_dim
        self.back_fork.output_dims = [
            self.bidir.children[backward_rnn_index].get_dim(output)
            for output in self.back_fork.output_names]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Creates bidirectional RNN source annotations.

        Args:
            source_sentence (Variable): Source sentence with words in
                                        vector representation.
            source_sentence_mask (Variable): Source mask

        Returns:
            tuple. The source annotations and the transposed mask.
        """
        # Time as first dimension
        vectors_t = source_sentence.T
        mask_t = source_sentence_mask.T

        forward_inputs = merge(
            self.fwd_fork.apply(vectors_t, as_dict=True),
            {'mask': mask_t})
        backward_inputs = merge(
            self.back_fork.apply(vectors_t, as_dict=True),
            {'mask': mask_t})

        annotations = self.bidir.apply(forward_inputs, backward_inputs)
        return annotations, mask_t
Exemplo n.º 34
0
class SimplePyramidLayer(Initializable):
    """Basic unit for the pyramid model.

    The layer maps a context through ``mlp_x``, forks the result into
    the inputs of a stack of GRUs (``transition``) and feeds the last
    output of the stack into a GMM emitter.
    """
    def __init__(self,
                 batch_size,
                 frame_size,
                 k,
                 depth,
                 size,
                 **kwargs):
        """Build the child bricks.

        Args:
            batch_size: Accepted for interface compatibility; not used
                        by the constructor.
            frame_size (int): Input dimension of ``mlp_x`` and output
                              size of the emitter.
            k (int): Passed as ``k`` to the GMM bricks; the GMM target
                     size is ``frame_size * k``.
            depth (int): Number of layers of both MLPs; the recurrent
                         stack has ``depth - 1`` GRUs.
            size (int): Width multiplier (hidden sizes are multiples
                        of ``32 * size``).
        """
        super(SimplePyramidLayer, self).__init__(**kwargs)

        target_size = frame_size * k

        depth_x = depth
        hidden_size_mlp_x = 32 * size

        depth_transition = depth - 1

        depth_theta = depth
        hidden_size_mlp_theta = 32 * size
        hidden_size_recurrent = 32 * size * 3

        activations_x = [Rectifier()] * depth_x

        dims_x = [frame_size] + [hidden_size_mlp_x] * (depth_x - 1) + \
                 [4 * hidden_size_recurrent]

        activations_theta = [Rectifier()] * depth_theta

        dims_theta = [hidden_size_recurrent] + \
                     [hidden_size_mlp_theta] * depth_theta

        self.mlp_x = MLP(activations=activations_x,
                         dims=dims_x,
                         name="mlp_x")

        transition = [GatedRecurrent(dim=hidden_size_recurrent,
                                     use_bias=True,
                                     name="gru_{}".format(i))
                      for i in range(depth_transition)]

        self.transition = RecurrentStack(transition,
                                         name="transition",
                                         skip_connections=True)

        mlp_theta = MLP(activations=activations_theta,
                        dims=dims_theta,
                        name="mlp_theta")

        mlp_gmm = GMMMLP(mlp=mlp_theta,
                         dim=target_size,
                         k=k,
                         const=0.00001,
                         name="gmm_wrap")

        self.gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm,
                                      output_size=frame_size, k=k)

        # Fork into every non-mask sequence input of the recurrent stack.
        normal_inputs = [name for name in self.transition.apply.sequences
                         if 'mask' not in name]

        self.fork = Fork(normal_inputs,
                         input_dim=4 * hidden_size_recurrent,
                         output_dims=self.transition.get_dims(normal_inputs))

        self.children = [self.mlp_x, self.transition,
                         self.gmm_emitter, self.fork]

    def monitoring_vars(self, cg):
        """Return min/mean/max monitors of the GMM ``mu`` and ``sigma``.

        Args:
            cg: Computation graph whose variables are filtered for the
                outputs of ``self.gmm_emitter.gmmmlp.apply``.

        Returns:
            list. Six named scalar variables.
        """
        mu, sigma, coeff = VariableFilter(
            applications=[self.gmm_emitter.gmmmlp.apply],
            name_regex="output")(cg.variables)

        min_sigma = sigma.min().copy(name="sigma_min")
        mean_sigma = sigma.mean().copy(name="sigma_mean")
        max_sigma = sigma.max().copy(name="sigma_max")

        min_mu = mu.min().copy(name="mu_min")
        mean_mu = mu.mean().copy(name="mu_mean")
        max_mu = mu.max().copy(name="mu_max")

        monitoring_vars = [mean_sigma, min_sigma,
                           min_mu, max_mu, mean_mu, max_sigma]

        return monitoring_vars

    @application
    def cost(self, x, context, **kwargs):
        """Return the mean emitter cost of ``x`` given ``context``.

        Extra keyword arguments are forwarded to the recurrent stack.
        As a side effect, ``self.final_states`` is set to named copies
        of the last element of every output of the stack.
        """
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))

        self.final_states = [
            var[-1].copy(name=var.name + "_final_value") for var in h]

        cost = self.gmm_emitter.cost(h[-1], x)
        return cost.mean()

    @application
    def generate(self, context, **kwargs):
        """Emit a sample from the GMM emitter given ``context``.

        The signature now takes ``self`` and ``**kwargs``; the body
        always referenced both, so the previous one-argument form could
        not be called.  Extra keyword arguments are forwarded to the
        recurrent stack, as in ``cost``.
        """
        x_g = self.mlp_x.apply(context)
        inputs = self.fork.apply(x_g, as_dict=True)
        h = self.transition.apply(**dict_union(inputs, kwargs))
        return self.gmm_emitter.emit(h[-1])
Exemplo n.º 35
0
def main(mode, save_path, num_batches, from_dump):
    """Train or interactively test the word-reversal sequence model.

    Args:
        mode (str): ``"train"`` builds the model and runs the training
            loop; ``"test"`` loads a saved model and samples from it in
            an endless prompt loop.  Any other value does nothing.
        save_path (str): In train mode, where the main loop is
            serialized (its basename also names the plot).  In test
            mode, the file the bricks are loaded from.
        num_batches: Passed to ``FinishAfter(after_n_batches=...)``.
        from_dump: If truthy, a ``LoadFromDump(from_dump)`` extension
            is added in front of the other extensions.
    """
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99],
                                char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([
            name
            for name in encoder.prototype.apply.sequences if name != 'mask'
        ],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension,
                             dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension,
            name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        # Push the generator-wide defaults first, then override the
        # transition's weights before anything is initialized.
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat([(key, value.get_value().shape)
                                    for key, value in params.items()],
                                   width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets,
            targets_mask,
            attended=encoder.apply(**dict_union(fork.apply(
                lookup.lookup(chars), return_dict=True),
                                                mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(VariableFilter(application=readout.readout,
                                         name="output")(cg.variables),
                          singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(cost=cost,
                                    step_rule=CompositeRule([
                                        GradientClipping(10.0),
                                        SteepestDescent(0.01)
                                    ]))

        observables = [
            cost, min_energy, max_energy, mean_activation, batch_size,
            max_length, cost_per_character, algorithm.total_step_norm,
            algorithm.total_gradient_norm
        ]
        # Also monitor the norm of every parameter and of its gradient.
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(
                named_copy(algorithm.gradients[param].norm(2),
                           name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(
                    observables, prefix="average", every_n_batches=10),
                # Stop after num_batches, or as soon as the gradient
                # norm becomes NaN.
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch", lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path,
                                  every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)
            ])
        main_loop.run()
    elif mode == "test":
        # NOTE(review): dill.load executes arbitrary code from the
        # file; only load model files from a trusted source.
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0],
            batch_size=chars.shape[1],
            attended=encoder.apply(**dict_union(
                fork.apply(lookup.lookup(chars), return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        # Use the module logger like every other message in this function.
        logger.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [
                char2code.get(char, char2code["<UNK>"])
                for char in line.lower().strip()
            ]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input, ))[0]
            print("Target: ", target)
            # Repeat the one input sentence batch_size times along axis 1.
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size,
                             axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                # Cut each sample after its first end-of-sentence code.
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Print samples in order of decreasing cost.
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
Exemplo n.º 36
0
def main():
    """Train a one-hidden-layer variational autoencoder on MNIST.

    Builds an encoder MLP followed by a fork into ``mu_phi`` and
    ``log_sigma_phi``, a decoder MLP followed by a linear ``mu_theta``
    layer, the KL and reconstruction terms of the cost, and runs the
    main loop for 200 epochs with RMSProp while monitoring the
    train/valid/test streams.
    """
    nvis, nhid, nlat, learn_prior = 784, 200, 100, False
    theano_rng = MRG_RandomStreams(134663)

    def init_config():
        # Fresh initializer objects for every brick.
        return dict(weights_init=IsotropicGaussian(std=0.001),
                    biases_init=Constant(0))

    def sequential_stream(dataset, batch_size):
        return DataStream(
            dataset=dataset,
            iteration_scheme=SequentialScheme(dataset.num_examples,
                                              batch_size))

    # Prior: zero mean and zero log-sigma; only tagged as parameters
    # when learn_prior is switched on.
    prior_mu = shared_floatx(numpy.zeros(nlat), name='prior_mu')
    prior_log_sigma = shared_floatx(numpy.zeros(nlat), name='prior_log_sigma')
    if learn_prior:
        for prior_variable in (prior_mu, prior_log_sigma):
            add_role(prior_variable, PARAMETER)

    # Encoder: nvis -> nhid, then forked into the posterior parameters.
    encoding_network = MLP(activations=[Rectifier()], dims=[nvis, nhid],
                           **init_config())
    encoding_network.initialize()
    encoding_parameter_mapping = Fork(
        output_names=['mu_phi', 'log_sigma_phi'], input_dim=nhid,
        output_dims=dict(mu_phi=nlat, log_sigma_phi=nlat), prototype=Linear(),
        **init_config())
    encoding_parameter_mapping.initialize()

    # Decoder: nlat -> nhid -> nvis.
    decoding_network = MLP(activations=[Rectifier()], dims=[nlat, nhid],
                           **init_config())
    decoding_network.initialize()
    decoding_parameter_mapping = Linear(
        input_dim=nhid, output_dim=nvis, name='mu_theta', **init_config())
    decoding_parameter_mapping.initialize()

    # Forward pass: encode, sample z = mu + epsilon * exp(log_sigma),
    # decode.
    x = tensor.matrix('features')
    h_phi = encoding_network.apply(x)
    mu_phi, log_sigma_phi = encoding_parameter_mapping.apply(h_phi)
    epsilon = theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
    epsilon.name = 'epsilon'
    z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
    z.name = 'z'
    h_theta = decoding_network.apply(z)
    mu_theta = decoding_parameter_mapping.apply(h_theta)

    # KL term, summed over the latent axis.
    posterior_spread = (
        tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2)
    kl_per_unit = (
        prior_log_sigma - log_sigma_phi
        + 0.5 * posterior_spread / tensor.exp(2 * prior_log_sigma)
        - 0.5)
    kl_term = kl_per_unit.sum(axis=1)
    kl_term.name = 'kl_term'
    kl_term_mean = kl_term.mean()
    kl_term_mean.name = 'avg_kl_term'

    # Reconstruction term, written with softplus on the logits mu_theta.
    per_pixel_loss = (
        x * tensor.nnet.softplus(-mu_theta)
        + (1 - x) * tensor.nnet.softplus(mu_theta))
    reconstruction_term = -per_pixel_loss.sum(axis=1)
    reconstruction_term.name = 'reconstruction_term'
    reconstruction_term_mean = -reconstruction_term.mean()
    reconstruction_term_mean.name = 'avg_reconstruction_term'

    cost = -(reconstruction_term - kl_term).mean()
    cost.name = 'nll_upper_bound'

    # Datasets and data streams: the 'train' set is split 50000/10000
    # into training and validation examples.
    mnist_train = MNIST(
        'train', start=0, stop=50000, binary=True, sources=('features',))
    train_loop_stream = sequential_stream(mnist_train, 100)
    train_monitor_stream = sequential_stream(mnist_train, 500)
    mnist_valid = MNIST(
        'train', start=50000, stop=60000, binary=True, sources=('features',))
    valid_monitor_stream = sequential_stream(mnist_valid, 500)
    mnist_test = MNIST('test', binary=True, sources=('features',))
    test_monitor_stream = sequential_stream(mnist_test, 500)

    # Everything tagged PARAMETER in the cost graph gets trained.
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params, step_rule=step_rule)
    monitored_quantities = [cost, reconstruction_term_mean, kl_term_mean]

    extensions = [Timing(), FinishAfter(after_n_epochs=200)]
    for monitor_stream, prefix in ((train_monitor_stream, "train"),
                                   (valid_monitor_stream, "valid"),
                                   (test_monitor_stream, "test")):
        extensions.append(DataStreamMonitoring(
            monitored_quantities, monitor_stream, prefix=prefix))
    extensions.append(Printing())

    main_loop = MainLoop(
        model=None, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
Exemplo n.º 37
0
def main():
    """Train a 1-D convolutional binary classifier on IMDB text.

    Builds three Conv1D/Rectifier stages over GloVe-embedded input,
    averages the result over axis 1, and scores it with a hidden
    linear layer plus a sigmoid output.  Training and test data are
    served by two child processes started here and read back through
    ``ServerDataStream``.

    Raises:
        KeyError: If ``theano.config.device`` is not one of the devices
            listed in ``ports`` (``gpu0`` or ``gpu1``).
    """
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    # The mask is otherwise unused; adding its mean keeps it an input
    # of the graph.
    x = m.mean() + x  # stupid mask not always needed...

    # Alternative: embedding_size = 300 with "glove.6B.300d.txt".
    embedding_size = 50
    glove_version = "vectors.6B.50d.txt"
    wstd = 0.02

    conv1 = Conv1D(filter_length=5, num_filters=128, input_dim=embedding_size,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0))
    conv1.initialize()
    o = conv1.apply(x)
    o = Rectifier(name="conv1red").apply(o)
    o = MaxPooling1D(pooling_length=5).apply(o)

    conv2 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0),
                   step=3,
                   name="conv2")
    conv2.initialize()
    o = conv2.apply(o)
    o = Rectifier(name="conv2rec").apply(o)

    conv3 = Conv1D(filter_length=5, num_filters=128, input_dim=128,
                   weights_init=IsotropicGaussian(std=wstd),
                   biases_init=Constant(0.0),
                   step=3,
                   name="conv3")
    conv3.initialize()
    o = conv3.apply(o)
    o = Rectifier(name="conv3rec").apply(o)

    # NOTE(review): the fork outputs were meant for a GRU that is
    # currently disabled; they do not feed into the cost.
    fork = Fork(weights_init=IsotropicGaussian(0.02),
                biases_init=Constant(0.),
                input_dim=128,
                output_dims=[128] * 3,
                output_names=['inputs', 'reset_inputs', 'update_inputs'])
    fork.initialize()

    inputs, reset_inputs, update_inputs = fork.apply(o)

    out = o.mean(axis=1)

    hidden = Linear(
        input_dim=128,
        output_dim=128,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.))
    hidden.initialize()

    o = hidden.apply(out)
    o = Rectifier().apply(o)

    score_layer = Linear(
        input_dim=128,
        output_dim=1,
        weights_init=IsotropicGaussian(std=wstd),
        biases_init=Constant(0.),
        name="linear2")
    score_layer.initialize()
    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    # Binary cross-entropy and the fraction of wrong 0.5-threshold
    # decisions.
    cost = - (y * T.log(probs) + (1 - y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            StepClipping(threshold=10),
            AdaM(),
        ]))

    # ========
    # Single-argument parenthesised print: same output on Python 2,
    # valid syntax on Python 3.
    print("setting up data")
    ports = {
        'gpu0_train': 5557,
        'gpu0_test': 5558,
        'gpu1_train': 5559,
        'gpu1_test': 5560,
    }

    batch_size = 16

    def start_server(port, which_set):
        """Serve shuffled, GloVe-embedded, padded batches on ``port``."""
        fuel.server.logger.setLevel('WARN')

        dataset = IMDBText(which_set)
        n_train = dataset.num_examples
        stream = DataStream(
            dataset=dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size))
        print("loading glove")
        glove = GloveTransformer(glove_version, data_stream=stream)
        padded = Padding(
            data_stream=glove,
            mask_sources=('features',))

        fuel.server.start_server(padded, port=port, hwm=20)

    # One data server per split, on the ports of the current device.
    train_port = ports[theano.config.device + '_train']
    train_p = Process(target=start_server, args=(train_port, 'train'))
    train_p.start()

    test_port = ports[theano.config.device + '_test']
    test_p = Process(target=start_server, args=(test_port, 'test'))
    test_p.start()

    train_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=train_port)
    test_stream = ServerDataStream(('features', 'features_mask', 'targets'), port=test_port)

    print("setting up model")

    n_examples = 25000
    # ======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=n_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_stream,
        prefix='test',
        after_epoch=True))
    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(Plot(theano.config.device+"_result", channels=[['test_misclassification', 'train_misclassification']], after_epoch=True))

    main_loop = MainLoop(
        model=model,
        data_stream=train_stream,
        algorithm=algorithm,
        extensions=extensions)
    main_loop.run()
Exemplo n.º 38
0
class DRAW(BaseRecurrent, Initializable, Random):
    """Attention-less DRAW-style recurrent variational autoencoder.

    At every step the encoder (an MLP followed by an LSTM) maps the
    input, the current reconstruction error and the previous decoder
    state to the mean and log standard deviation of a Gaussian over
    ``z``.  The decoder (an MLP followed by an LSTM) maps a sample of
    ``z`` to an additive update of the canvas ``c_states``.

    Parameters
    ----------
    nvis : int
        Size of the visible vector ``x`` and of the canvas ``c_states``.
    nhid : int
        Size of the latent vector ``z``.
    encoding_mlp, decoding_mlp : brick
        Bricks exposing ``dims``, ``children`` and ``apply``.  They are
        renamed here and their first/last dimensions are overwritten in
        :meth:`_push_allocation_config`.
    encoding_lstm, decoding_lstm : brick
        Recurrent bricks exposing ``dim``, ``get_dim`` and an ``apply``
        that accepts ``inputs``, ``states``, ``cells`` and ``iterate``.
    T : int, optional
        Number of time steps the input is repeated for (and the number
        of latent samples drawn in :meth:`sample`).  Defaults to 1.

    """
    def __init__(self, nvis, nhid, encoding_mlp, encoding_lstm, decoding_mlp,
                 decoding_lstm, T=1, **kwargs):
        super(DRAW, self).__init__(**kwargs)

        self.nvis = nvis
        self.nhid = nhid
        self.T = T

        # Encoder: fixed brick names, MLP children numbered by position.
        self.encoding_mlp = encoding_mlp
        self.encoding_mlp.name = 'encoder_mlp'
        for i, child in enumerate(self.encoding_mlp.children):
            child.name = '{}_{}'.format(self.encoding_mlp.name, i)
        self.encoding_lstm = encoding_lstm
        self.encoding_lstm.name = 'encoder_lstm'
        # Maps the encoder LSTM state to the two posterior parameters.
        self.encoding_parameter_mapping = Fork(
            output_names=['mu_phi', 'log_sigma_phi'], prototype=Linear())

        # Decoder: same naming scheme as the encoder.
        self.decoding_mlp = decoding_mlp
        self.decoding_mlp.name = 'decoder_mlp'
        for i, child in enumerate(self.decoding_mlp.children):
            child.name = '{}_{}'.format(self.decoding_mlp.name, i)
        self.decoding_lstm = decoding_lstm
        self.decoding_lstm.name = 'decoder_lstm'
        # Maps the decoder LSTM state to the canvas update.
        self.decoding_parameter_mapping = Linear(name='mu_theta')

        # Prior over z: zero mean and zero log-sigma (i.e. unit std).
        self.prior_mu = tensor.zeros((self.nhid,))
        self.prior_mu.name = 'prior_mu'
        self.prior_log_sigma = tensor.zeros((self.nhid,))
        self.prior_log_sigma.name = 'prior_log_sigma'

        self.children = [self.encoding_mlp, self.encoding_lstm,
                         self.encoding_parameter_mapping,
                         self.decoding_mlp, self.decoding_lstm,
                         self.decoding_parameter_mapping]

    def _push_allocation_config(self):
        """Propagate ``nvis``/``nhid`` and the LSTM sizes to the children."""
        # The attention-less read operation concatenates x and x_hat, and
        # we feed the decoder back into the encoder, which is why the input
        # to the encoding MLP is twice the size of x plus the size of the
        # decoding LSTM.
        self.encoding_mlp.dims[0] = 2 * self.nvis + self.decoding_lstm.dim
        # Each MLP feeds an LSTM, whose input is sized 4 * dim here.
        self.encoding_mlp.dims[-1] = 4 * self.encoding_lstm.dim
        self.encoding_parameter_mapping.input_dim = self.encoding_lstm.dim
        self.encoding_parameter_mapping.output_dims = dict(
            mu_phi=self.nhid, log_sigma_phi=self.nhid)
        self.decoding_mlp.dims[0] = self.nhid
        self.decoding_mlp.dims[-1] = 4 * self.decoding_lstm.dim
        self.decoding_parameter_mapping.input_dim = self.decoding_lstm.dim
        self.decoding_parameter_mapping.output_dim = self.nvis

    def sample(self, num_samples):
        """Generate ``num_samples`` outputs by decoding draws from the prior.

        Draws ``z`` of shape ``(T, num_samples, nhid)`` from the prior,
        runs :meth:`decode_z` over it and returns the sigmoid of the
        canvas at the last time step.
        """
        z = self.theano_rng.normal(size=(self.T, num_samples, self.nhid),
                                   avg=self.prior_mu,
                                   std=tensor.exp(self.prior_log_sigma))
        # [0] selects the 'c_states' output, [-1] its last time step.
        return tensor.nnet.sigmoid(self.decode_z(z)[0][-1])

    @application(inputs=['x'], outputs=['x_hat'])
    def reconstruct(self, x):
        """Return the sigmoid of the final canvas obtained from ``x``.

        ``x`` is repeated ``T`` times along a new leading axis before
        being passed through :meth:`apply`.
        """
        x_sequence = tensor.tile(x.dimshuffle('x', 0, 1), (self.T, 1, 1))
        rval = self.apply(x_sequence)
        return tensor.nnet.sigmoid(rval[0][-1])

    @recurrent(sequences=['z'], contexts=[],
               states=['c_states', 'decoding_states', 'decoding_cells'],
               outputs=['c_states', 'decoding_states', 'decoding_cells'])
    def decode_z(self, z, c_states=None, decoding_states=None,
                 decoding_cells=None):
        """One decoder-only step: update the canvas from a given ``z``.

        Returns the new canvas and the new decoder LSTM states and cells.
        """
        h_mlp_theta = self.decoding_mlp.apply(z)
        h_lstm_theta, cells_theta = self.decoding_lstm.apply(
            inputs=h_mlp_theta, states=decoding_states, cells=decoding_cells,
            iterate=False)
        # The canvas is updated additively.
        new_c_states = (
            c_states + self.decoding_parameter_mapping.apply(h_lstm_theta))

        return new_c_states, h_lstm_theta, cells_theta

    @recurrent(sequences=['x'], contexts=[],
               states=['c_states', 'encoding_states', 'encoding_cells',
                       'decoding_states', 'decoding_cells'],
               outputs=['c_states', 'encoding_states', 'encoding_cells',
                        'decoding_states', 'decoding_cells', 'mu_phi',
                        'log_sigma_phi'])
    def apply(self, x, c_states=None, encoding_states=None,
              encoding_cells=None, decoding_states=None, decoding_cells=None):
        """One full encode/sample/decode step.

        Returns the new canvas, the new encoder and decoder LSTM states
        and cells, and the posterior parameters ``mu_phi`` and
        ``log_sigma_phi`` used to sample ``z`` at this step.
        """
        # Reconstruction error with respect to the current canvas.
        x_hat = x - tensor.nnet.sigmoid(c_states)
        # Concatenate x and x_hat
        r = tensor.concatenate([x, x_hat], axis=1)
        # Concatenate r and h_dec
        h_mlp_phi = self.encoding_mlp.apply(
            tensor.concatenate([r, decoding_states], axis=1))
        h_lstm_phi, cells_phi = self.encoding_lstm.apply(
            inputs=h_mlp_phi, states=encoding_states, cells=encoding_cells,
            iterate=False)
        phi = self.encoding_parameter_mapping.apply(h_lstm_phi)
        mu_phi, log_sigma_phi = phi
        # Sample z as mu + epsilon * sigma with epsilon drawn from the RNG.
        epsilon = self.theano_rng.normal(size=mu_phi.shape, dtype=mu_phi.dtype)
        epsilon.name = 'epsilon'
        z = mu_phi + epsilon * tensor.exp(log_sigma_phi)
        z.name = 'z'
        h_mlp_theta = self.decoding_mlp.apply(z)
        h_lstm_theta, cells_theta = self.decoding_lstm.apply(
            inputs=h_mlp_theta, states=decoding_states, cells=decoding_cells,
            iterate=False)
        new_c_states = (
            c_states + self.decoding_parameter_mapping.apply(h_lstm_theta))

        return (new_c_states, h_lstm_phi, cells_phi, h_lstm_theta, cells_theta,
                mu_phi, log_sigma_phi)

    @application(inputs=['x'], outputs=['log_likelihood_lower_bound'])
    def log_likelihood_lower_bound(self, x):
        """Return ``reconstruction_term - kl_term`` for ``x``.

        ``kl_term`` and the negated ``reconstruction_term`` are attached
        to the returned variable as auxiliary variables.
        """
        x_sequence = tensor.tile(x.dimshuffle('x', 0, 1), (self.T, 1, 1))
        rval = self.apply(x_sequence)
        c_states, mu_phi, log_sigma_phi = rval[0], rval[-2], rval[-1]

        # Broadcast the prior parameters over the two leading axes.
        prior_mu = self.prior_mu.dimshuffle('x', 'x', 0)
        prior_log_sigma = self.prior_log_sigma.dimshuffle('x', 'x', 0)
        # KL between the per-step posterior and the prior, summed over the
        # last axis and over the leading (time) axis.
        kl_term = (
            prior_log_sigma - log_sigma_phi +
            0.5 * (
                tensor.exp(2 * log_sigma_phi) + (mu_phi - prior_mu) ** 2
            ) / tensor.exp(2 * prior_log_sigma) - 0.5).sum(axis=2).sum(axis=0)
        kl_term.name = 'kl_term'

        # Computed from the final canvas c_states[-1] via softplus.
        reconstruction_term = - (
            x * tensor.nnet.softplus(-c_states[-1]) +
            (1 - x) * tensor.nnet.softplus(c_states[-1])).sum(axis=1)
        reconstruction_term.name = 'reconstruction_term'

        log_likelihood_lower_bound = reconstruction_term - kl_term
        log_likelihood_lower_bound.name = 'log_likelihood_lower_bound'

        annotation = Annotation()
        annotation.add_auxiliary_variable(kl_term, name='kl_term')
        # Note the sign flip: the auxiliary variable is the negated term.
        annotation.add_auxiliary_variable(-reconstruction_term,
                                          name='reconstruction_term')
        add_annotation(log_likelihood_lower_bound, annotation)

        return log_likelihood_lower_bound

    def get_dim(self, name):
        """Return the dimension of the recurrent state called ``name``.

        Names not handled here are delegated to the parent class.
        """
        # Compare by value: identity comparison against string literals
        # only works by accident of string interning.
        if name == 'c_states':
            return self.nvis
        elif name == 'encoding_states':
            return self.encoding_lstm.get_dim('states')
        elif name == 'encoding_cells':
            return self.encoding_lstm.get_dim('cells')
        elif name == 'decoding_states':
            return self.decoding_lstm.get_dim('states')
        elif name == 'decoding_cells':
            return self.decoding_lstm.get_dim('cells')
        else:
            return super(DRAW, self).get_dim(name)
Exemplo n.º 39
0
class AddParameters(Brick):
    """Make a transition function depend on an extra parameter vector.

    The parameters are projected by a fork and added to every non-mask
    sequence input of the wrapped transition; when iterating they are
    also used to compute the initial state.

    An improved version of this brick should be moved to the main body
    of the library, because it is clearly reusable (e.g. it can be a
    part of an Encoder-Decoder translation model).

    """
    @lazy
    def __init__(self, transition, num_params, params_name,
                 weights_init, biases_init, **kwargs):
        super(AddParameters, self).__init__(**kwargs)
        # Keep this call before any other local is bound: it receives
        # locals(), and the attributes read later (self.transition,
        # self.num_params, ...) presumably come from it.
        update_instance(self, locals())

        state_names = transition.apply.states
        self.input_names = [sequence
                            for sequence in transition.apply.sequences
                            if sequence != 'mask']
        self.state_name = state_names[0]
        assert len(state_names) == 1

        self.fork = Fork(self.input_names)
        # Could be also several init bricks, one for each of the states
        self.init = MLP([Identity()], name="init")
        self.children = [self.transition, self.fork, self.init]

    def _push_allocation_config(self):
        """Size the fork and the init MLP from the wrapped transition."""
        transition = self.transition
        self.fork.input_dim = self.num_params
        self.fork.fork_dims = dict(
            (input_name, transition.get_dim(input_name))
            for input_name in self.input_names)
        self.init.dims[0] = self.num_params
        self.init.dims[-1] = transition.get_dim(self.state_name)

    def _push_initialization_config(self):
        """Hand the initialization schemes that are set down to all children."""
        for brick in self.children:
            if self.weights_init:
                brick.weights_init = self.weights_init
            if self.biases_init:
                brick.biases_init = self.biases_init

    @application
    def apply(self, **kwargs):
        """Apply the transition with parameter-shifted inputs.

        Expects the transition's input sequences and ``params`` among the
        keyword arguments; everything else is forwarded untouched.
        """
        raw_inputs = {}
        for input_name in self.input_names:
            raw_inputs[input_name] = kwargs.pop(input_name)
        params = kwargs.pop("params")
        shifts = self.fork.apply(params, return_dict=True)
        for input_name in self.input_names:
            kwargs[input_name] = raw_inputs[input_name] + shifts[input_name]
        # When iterating, the initial state is computed from the parameters.
        if kwargs.get('iterate', True):
            kwargs[self.state_name] = self.initial_state(None, params=params)
        return self.transition.apply(**kwargs)

    @apply.delegate
    def apply_delegate(self):
        return self.transition.apply

    @apply.property('contexts')
    def apply_contexts(self):
        return [self.params_name] + self.transition.apply.contexts

    @application
    def initial_state(self, batch_size, *args, **kwargs):
        """Compute the initial state from ``kwargs['params']``."""
        params = kwargs['params']
        return self.init.apply(params)

    def get_dim(self, name):
        """Return ``num_params`` for 'params', else ask the transition."""
        if name != 'params':
            return self.transition.get_dim(name)
        return self.num_params
def get_prernn(args):
    """Build the symbolic inputs of the recurrent layers and their mask.

    Reads ``rnn_type``, ``state_dim``, ``layers``, ``skip_connections``,
    ``dataset``, ``mini_batch_size`` and ``used_inputs`` from ``args``.

    Returns a tuple ``(prernn, x_mask)`` where ``prernn`` is the output
    of the fork applied to the ``features`` variable and ``x_mask`` is
    the mask variable (all ones when the dataset provides no mask).
    """
    # time x batch
    x_mask = tensor.fmatrix('mask')

    # An LSTM takes inputs four times as wide as its state
    lstm = args.rnn_type == 'lstm'
    state_dim = 4 * args.state_dim if lstm else args.state_dim

    # The first layer is always fed by the fork; the deeper ones only
    # when skip connections are requested
    fed_layers = [layer for layer in range(args.layers)
                  if layer == 0 or args.skip_connections]
    output_names = [
        "inputs" + (RECURRENTSTACK_SEPARATOR + str(layer) if layer > 0
                    else '')
        for layer in fed_layers]
    output_dims = [state_dim] * len(fed_layers)

    # Choose the brick to be forked: a LookupTable when the dataset
    # provides indices (fixed vocabulary, x is a 2D tensor), a Linear
    # brick when it gives raw values (x is a 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)
    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            # Zero out everything from index ``used_inputs`` on along
            # the first axis
            dropped = x[args.used_inputs:, :, :]
            x = tensor.set_subtensor(
                dropped, tensor.zeros_like(dropped, dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define, initialize and apply the fork
    fork = Fork(output_names=output_names,
                input_dim=features,
                output_dims=output_dims,
                prototype=forked)
    fork.initialize()
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if not args.skip_connections:
        prernn.name = "pre_rnn"
    else:
        for index, layer_input in enumerate(prernn):
            layer_input.name = "pre_rnn_" + str(index)

    return prernn, x_mask
Exemplo n.º 41
0
class Decoder(Initializable):
    """Sequence decoder conditioned on a single representation.

    The representation is forked into the contexts and the initial
    state of a ``GatedRecurrentWithContext`` transition and is also fed
    to the readout as ``readout_context``.

    Parameters
    ----------
    vocab_size : int
        Size of the output vocabulary (readout dimension).
    embedding_dim : int
        Dimension of the feedback embeddings.
    state_dim : int
        Dimension of the recurrent state.
    representation_dim : int
        Dimension of the representation given to :meth:`cost`.

    """
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        # NOTE(review): the hard-coded 1000 (bias and merged dimension)
        # only seems to agree with ``state_dim // 2`` below when
        # state_dim == 1000 -- confirm before using other sizes.
        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 # Floor division keeps the dimension an integer even
                 # under true division (Python 3 / __future__.division).
                 Linear(input_dim=state_dim // 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in
                          self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'], prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        """Size the fork: representation in, ``state_dim`` per output."""
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'], outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        """Return the masked generation cost of ``target_sentence``.

        The target and its mask are transposed before use; the summed
        masked cost is divided by the second axis of the transposed mask
        (presumably the batch size -- confirm against the caller).
        """
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        # (contexts get a leading broadcastable axis, states do not)
        contexts = {key: value.dimshuffle('x', 0, 1)
                    if key not in self.transition.apply.states else value
                    for key, value
                    in self.fork.apply(representation, as_dict=True).items()}
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return (cost * target_sentence_mask).sum() / target_sentence_mask.shape[1]
Exemplo n.º 42
0
class NoLookupEncoder(Initializable):
    """Variation of ``BidirectionalEncoder`` for sparse feature maps.

    No lookup table is used: the predefined distributed representations
    are fed directly into the encoder network.
    """
    def __init__(self, embedding_dim, state_dim, **kwargs):
        """Constructor. Only single layer architectures are supported.

        Args:
            embedding_dim (int): Dimensionality of the word vectors
                                 defined by the sparse feature map.
            state_dim (int): Size of the recurrent layer.
        """
        super(NoLookupEncoder, self).__init__(**kwargs)
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim))
        # Both directions fork the same non-mask sequences of the prototype.
        self.fwd_fork = Fork(
            [sequence
             for sequence in self.bidir.prototype.apply.sequences
             if sequence != 'mask'],
            prototype=Linear(),
            name='fwd_fork')
        self.back_fork = Fork(
            [sequence
             for sequence in self.bidir.prototype.apply.sequences
             if sequence != 'mask'],
            prototype=Linear(),
            name='back_fork')
        self.children = [self.bidir, self.fwd_fork, self.back_fork]

    def _push_allocation_config(self):
        """Sets the dimensions of the forward and backward forks."""
        # children[0] sizes the forward fork, children[1] the backward one.
        for fork, child_index in ((self.fwd_fork, 0), (self.back_fork, 1)):
            fork.input_dim = self.embedding_dim
            fork.output_dims = [
                self.bidir.children[child_index].get_dim(output_name)
                for output_name in fork.output_names
            ]

    @application(inputs=['source_sentence', 'source_sentence_mask'],
                 outputs=['representation', 'representation_mask'])
    def apply(self, source_sentence, source_sentence_mask):
        """Creates bidirectional RNN source annotations.

        Args:
            source_sentence (Variable): Source sentence with words in
                                        vector representation.
            source_sentence_mask (Variable): Source mask

        Returns:
            tuple. The source annotations and the transposed mask.
        """
        # Time as first dimension
        sentence_t = source_sentence.T
        mask_t = source_sentence_mask.T

        forward_inputs = merge(
            self.fwd_fork.apply(sentence_t, as_dict=True),
            {'mask': mask_t})
        backward_inputs = merge(
            self.back_fork.apply(sentence_t, as_dict=True),
            {'mask': mask_t})
        representation = self.bidir.apply(forward_inputs, backward_inputs)
        return representation, mask_t
Exemplo n.º 43
0
class TargetWordEncoder(Initializable):
    """Target-side word encoder.

    Uses a single RNN stack to map a character-level word to a vector.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding lookup table.
    embedding_dim : int
        Dimension of the embeddings.
    dgru_state_dim : int
        State dimension of each DGRU layer.
    dgru_depth : int
        Number of stacked DGRU layers.

    """
    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(TargetWordEncoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        self.dgru = RecurrentStack([
            DGRU(activation=Tanh(), dim=self.dgru_state_dim)
            for _ in range(dgru_depth)
        ],
                                   skip_connections=True)

        # One linear projection of the embeddings per non-mask sequence
        # input of the stack.
        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(),
            name='gru_fork')

        self.children = [self.lookup, self.dgru, self.gru_fork]

    def _push_allocation_config(self):
        """Size the lookup table and the fork from the stored dimensions."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation'])
    def apply(self, char_seq, sample_matrix, char_aux):
        """Encode a character sequence and sample word representations.

        ``char_aux`` is passed to the RNN stack as its mask; the output
        of the last layer is combined with ``sample_matrix`` through a
        batched dot product.
        """
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), {'mask': char_aux}))
        # With several layers the stack returns one output per layer;
        # keep the last one.
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]
        # Swap the two leading axes for the batched product, then swap back.
        sampled_representation = tensor.batched_dot(
            sample_matrix, gru_out.dimshuffle([1, 0, 2]))
        return sampled_representation.dimshuffle([1, 0, 2])

    @application(inputs=['target_single_char'])
    def single_emit(self, target_single_char, batch_size, mask, states=None):
        """Run one (non-iterated) step of the stack on a single character.

        When ``states`` is None the stack's initial states for
        ``batch_size`` are used.
        """
        # Time as first dimension
        # only one batch
        embeddings = self.lookup.apply(target_single_char)
        if states is None:
            states = self.dgru.initial_states(batch_size)
        # Layer 0 uses the plain name, deeper layers a separator suffix.
        states_dict = {'states': states[0]}
        for i in range(1, self.dgru_depth):
            states_dict['states' + RECURRENTSTACK_SEPARATOR +
                        str(i)] = states[i]
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), states_dict, {
                'mask': mask,
                'iterate': False
            }))
        return gru_out

    @single_emit.property('outputs')
    def single_emit_outputs(self):
        return [
            'gru_out' + RECURRENTSTACK_SEPARATOR + str(i)
            for i in range(self.dgru_depth)
        ]

    def get_dim(self, name):
        """Return the dimension of ``name``.

        'output' and 'feedback' have the DGRU state dimension; any other
        name is delegated to the parent class.
        """
        if name in ['output', 'feedback']:
            return self.dgru_state_dim
        # The result must be returned, otherwise every delegated lookup
        # silently yields None.
        return super(TargetWordEncoder, self).get_dim(name)
def get_prernn(args):
    """Build the symbolic inputs of the recurrent layers and their mask.

    Reads ``rnn_type``, ``state_dim``, ``layers``, ``skip_connections``,
    ``dataset``, ``mini_batch_size`` and ``used_inputs`` from ``args``.

    Returns a tuple ``(prernn, x_mask)`` where ``prernn`` is the output
    of the fork applied to the ``features`` variable and ``x_mask`` is
    the mask variable (all ones when the dataset provides no mask).
    """
    # time x batch
    x_mask = tensor.fmatrix('mask')

    # An LSTM takes inputs four times as wide as its state
    lstm = args.rnn_type == 'lstm'
    state_dim = 4 * args.state_dim if lstm else args.state_dim

    # The first layer is always fed by the fork; the deeper ones only
    # when skip connections are requested
    fed_layers = [layer for layer in range(args.layers)
                  if layer == 0 or args.skip_connections]
    output_names = [
        "inputs" + (RECURRENTSTACK_SEPARATOR + str(layer) if layer > 0
                    else '')
        for layer in fed_layers]
    output_dims = [state_dim] * len(fed_layers)

    # Choose the brick to be forked: a LookupTable when the dataset
    # provides indices (fixed vocabulary, x is a 2D tensor), a Linear
    # brick when it gives raw values (x is a 3D tensor)
    if has_indices(args.dataset):
        features = args.mini_batch_size
        x = tensor.lmatrix('features')
        vocab_size = get_output_size(args.dataset)
        lookup = LookupTable(length=vocab_size, dim=state_dim)
        lookup.weights_init = initialization.IsotropicGaussian(0.1)
        lookup.biases_init = initialization.Constant(0)
        forked = FeedforwardSequence([lookup.apply])
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x, dtype=floatX)
    else:
        x = tensor.tensor3('features', dtype=floatX)
        if args.used_inputs is not None:
            # Zero out everything from index ``used_inputs`` on along
            # the first axis
            dropped = x[args.used_inputs:, :, :]
            x = tensor.set_subtensor(
                dropped, tensor.zeros_like(dropped, dtype=floatX))
        features = get_output_size(args.dataset)
        forked = Linear(input_dim=features, output_dim=state_dim)
        forked.weights_init = initialization.IsotropicGaussian(0.1)
        forked.biases_init = initialization.Constant(0)
        if not has_mask(args.dataset):
            x_mask = tensor.ones_like(x[:, :, 0], dtype=floatX)

    # Define, initialize and apply the fork
    fork = Fork(output_names=output_names,
                input_dim=features,
                output_dims=output_dims,
                prototype=forked)
    fork.initialize()
    prernn = fork.apply(x)

    # Give a name to the input of each layer
    if not args.skip_connections:
        prernn.name = "pre_rnn"
    else:
        for index, layer_input in enumerate(prernn):
            layer_input.name = "pre_rnn_" + str(index)

    return prernn, x_mask
Exemplo n.º 45
0
class Interpolator(AbstractReadout):
    """Readout that produces its output character by character.

    Besides the usual emitter / feedback / merge bricks it owns a
    character lookup table, an IGRU (or a stack of them) and a linear
    layer mapping the IGRU output to vocabulary-sized readouts.
    """
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 igru_state_dim,
                 igru_depth,
                 trg_dgru_depth,
                 emitter,
                 feedback_brick,
                 merge=None,
                 merge_prototype=None,
                 post_merge=None,
                 **kwargs):
        merged_dim = igru_state_dim
        # Fall back to default merge / post-merge bricks when none given
        merge = merge or Merge(input_names=kwargs['source_names'],
                               prototype=merge_prototype)
        post_merge = post_merge or Bias(dim=merged_dim)

        # A depth of one keeps the bare IGRU (for compatibility);
        # otherwise the IGRU is the bottom layer of a recurrent stack
        if igru_depth != 1:
            bottom_layer = IGRU(dim=igru_state_dim, name='igru')
            upper_layers = [
                UpperIGRU(dim=igru_state_dim,
                          activation=Tanh(),
                          name='upper_igru' + str(level))
                for level in range(1, igru_depth)
            ]
            self.igru = RecurrentStack([bottom_layer] + upper_layers,
                                       skip_connections=True)
        else:
            self.igru = IGRU(dim=igru_state_dim)

        # Plain configuration values
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.igru_state_dim = igru_state_dim
        self.igru_depth = igru_depth
        self.trg_dgru_depth = trg_dgru_depth
        self.merged_dim = merged_dim

        # Bricks supplied by the caller (or defaulted above)
        self.emitter = emitter
        self.feedback_brick = feedback_brick
        self.merge = merge
        self.post_merge = post_merge

        # Bricks owned by the interpolator itself
        self.lookup = LookupTable(name='embeddings')
        self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                     output_dim=vocab_size)
        forked_names = [
            sequence for sequence in self.igru.apply.sequences
            if sequence != 'mask' and sequence != 'input_states'
        ]
        self.gru_fork = Fork(forked_names,
                             prototype=Linear(),
                             name='gru_fork')

        kwargs.setdefault('children', []).extend([
            self.emitter, self.feedback_brick, self.merge, self.post_merge,
            self.igru, self.lookup, self.gru_to_softmax, self.gru_fork
        ])
        super(Interpolator, self).__init__(**kwargs)

    def _push_allocation_config(self):
        """Propagate the stored dimensions to the child bricks."""
        # Character embeddings and the fork that feeds them to the IGRU
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim
        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.igru.get_dim(output_name)
            for output_name in self.gru_fork.output_names
        ]
        # Merge of the readout sources followed by the post-merge brick
        self.merge.input_names = self.source_names
        self.merge.input_dims = self.source_dims
        self.merge.output_dim = self.merged_dim
        self.post_merge.input_dim = self.merged_dim
        self.post_merge.output_dim = self.igru_state_dim
        self.emitter.readout_dim = self.get_dim('readouts')

    @application
    def initial_igru_outputs(self, batch_size):
        """Return the initial states of the IGRU for ``batch_size``."""
        return self.igru.initial_states(batch_size)

    @application
    def emit(self, readouts):
        """Delegate to the emitter's ``emit``."""
        return self.emitter.emit(readouts)

    @application
    def cost(self, readouts, outputs):
        """Delegate to the emitter's ``cost``."""
        return self.emitter.cost(readouts, outputs)

    @application
    def initial_outputs(self, batch_size):
        """Delegate to the emitter's ``initial_outputs``."""
        return self.emitter.initial_outputs(batch_size)

    @application(outputs=['feedback'])
    def feedback(self, outputs):
        """Delegate to the feedback brick's ``feedback``."""
        return self.feedback_brick.feedback(outputs)

    @application(outputs=['feedback'])
    def feedback_apply(self, target_char_seq, target_sample_matrix,
                       target_char_aux):
        """Delegate to the feedback brick's ``apply``."""
        return self.feedback_brick.apply(target_char_seq, target_sample_matrix,
                                         target_char_aux)

    @application
    def single_feedback(self,
                        target_single_char,
                        batch_size,
                        mask=None,
                        states=None):
        """Delegate to the feedback brick's ``single_emit``."""
        return self.feedback_brick.single_emit(target_single_char, batch_size,
                                               mask, states)

    @single_feedback.property('outputs')
    def single_feedback_outputs(self):
        return [
            'single_feedback' + RECURRENTSTACK_SEPARATOR + str(level)
            for level in range(self.trg_dgru_depth)
        ]

    @application(outputs=['gru_out', 'readout_chars'])
    def single_readout_gru(self, target_prev_char, target_prev_char_aux,
                           input_states, states):
        """Run one non-iterated IGRU step and compute its readout.

        Returns the IGRU output and the readout computed from it (from
        the last layer's output when the IGRU is a stack).
        """
        embeddings = self.lookup.apply(target_prev_char)
        # Layer 0 uses the plain name, deeper layers a separator suffix
        states_dict = {'states': states[0]}
        for level in range(1, self.igru_depth):
            key = 'states' + RECURRENTSTACK_SEPARATOR + str(level)
            states_dict[key] = states[level]
        step_kwargs = merge(
            self.gru_fork.apply(embeddings, as_dict=True), states_dict, {
                'mask': target_prev_char_aux,
                'input_states': input_states,
                'iterate': False
            })
        gru_out = self.igru.apply(**step_kwargs)
        top_output = gru_out[-1] if self.igru_depth > 1 else gru_out
        readout_chars = self.gru_to_softmax.apply(top_output)
        return gru_out, readout_chars

    @application
    def readout(self, **kwargs):
        """Merge the named sources and apply the post-merge brick."""
        sources = {}
        for source_name in self.merge.input_names:
            sources[source_name] = kwargs[source_name]
        return self.post_merge.apply(self.merge.apply(**sources))

    @application(outputs=['readout_chars'])
    def readout_gru(self, target_prev_char_seq, target_prev_char_aux,
                    input_states):
        """Run the IGRU over a character sequence and return the readouts."""
        embeddings = self.lookup.apply(target_prev_char_seq)
        igru_kwargs = merge(self.gru_fork.apply(embeddings, as_dict=True), {
            'mask': target_prev_char_aux,
            'input_states': input_states
        })
        gru_out = self.igru.apply(**igru_kwargs)
        # A stack returns one output per layer; keep the last one
        if self.igru_depth > 1:
            gru_out = gru_out[-1]
        return self.gru_to_softmax.apply(gru_out)

    def get_dim(self, name):
        """Return the dimension of 'outputs', 'feedback' or 'readouts'.

        Any other name is looked up starting after ``AbstractReadout``
        in the method resolution order.
        """
        if name == 'readouts':
            return self.readout_dim
        if name == 'feedback':
            return self.feedback_brick.get_dim(name)
        if name == 'outputs':
            return self.emitter.get_dim(name)
        return super(AbstractReadout, self).get_dim(name)
Exemplo n.º 46
0
def build_model_lstm(vocab_size, args, dtype=floatX):
    """Build the symbolic graph of a stacked-LSTM language model.

    Parameters
    ----------
    vocab_size : int
        Length of the lookup table and output dimension of the final
        linear layer.
    args : object
        Hyper-parameter container. The attributes read here are
        ``context``, ``state_dim``, ``layers``, ``skip_connections``,
        ``mini_batch_size`` and ``load_path``.
    dtype : str
        Not used by the body; kept so existing callers keep working.

    Returns
    -------
    tuple
        ``(cost, cross_entropy, updates, gate_values)`` where ``updates``
        is a list of ``(shared_variable, new_value)`` pairs that carry the
        last hidden state and the last cell of every layer over to the
        next call, and ``gate_values`` maps ``"in_gates"``,
        ``"forget_gates"`` and ``"out_gates"`` to the per-layer gate
        outputs.
    """
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # The lookup/fork output is 4 * state_dim wide.
    virtual_dim = 4 * state_dim

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Build the model: one fork output per layer that receives the input
    # (only the first layer unless skip connections are enabled).
    output_names = []
    output_dims = []
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(virtual_dim)

    lookup = LookupTable(length=vocab_size, dim=virtual_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    # Make sure time_length is what we need
    fork = Fork(output_names=output_names,
                input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    transitions = [
        LSTM(dim=state_dim, activation=Tanh()) for _ in range(layers)
    ]

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # If skip_connections: dim = layers * state_dim
    # else: dim = state_dim
    output_layer = Linear(input_dim=skip_connections * layers * state_dim +
                          (1 - skip_connections) * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t, layer_input in enumerate(pre_rnn):
            layer_input.name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN. The initial state and cell of every layer
    # live in shared variables so they can be carried across calls.
    kwargs = OrderedDict()
    init_states = {}
    init_cells = {}
    for d in range(layers):
        if d > 0:
            suffix = '_' + str(d)
        else:
            suffix = ''
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs'] = pre_rnn
        init_states[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                       name='state0_%d' % d)
        init_cells[d] = theano.shared(numpy.zeros(
            (args.mini_batch_size, state_dim)).astype(floatX),
                                      name='cell0_%d' % d)
        kwargs['states' + suffix] = init_states[d]
        kwargs['cells' + suffix] = init_cells[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state, cell, in, forget, out, state_1,
    #        cell_1, in_1, forget_1, out_1 ...]

    # Last time step of every layer's state and cell.
    last_states = {}
    last_cells = {}
    for d in range(layers):
        last_states[d] = h[5 * d][-1, :, :]
        last_cells[d] = h[5 * d + 1][-1, :, :]

    # The updates of the hidden states and cells. Each shared variable must
    # receive its own quantity: the cell is updated with the last cell, not
    # with the last hidden state.
    updates = []
    for d in range(layers):
        updates.append((init_states[d], last_states[d]))
        updates.append((init_cells[d], last_cells[d]))

    # Extract the gate values (every fifth output, see the layout above)
    in_gates = h[2::5]
    forget_gates = h[3::5]
    out_gates = h[4::5]

    gate_values = {
        "in_gates": in_gates,
        "forget_gates": forget_gates,
        "out_gates": out_gates
    }

    h = h[::5]

    # Now we have correctly:
    # h = [state, state_1, state_2 ...] if layers > 1
    # h = [state] if layers == 1

    # If we have skip connections, concatenate all the states
    # Else only consider the state of the highest layer
    if layers > 1:
        if skip_connections:
            h = tensor.concatenate(h, axis=2)
        else:
            h = h[-1]
    else:
        h = h[0]
    h.name = "hidden_state"

    # The first `context` time steps are excluded from the cost.
    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    time, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(), presoft.reshape((batch * time, feat)))
    # Dividing by log(2) expresses the cross entropy in bits.
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    # Dont initialize as Orthogonal if we are about to load new parameters
    if args.load_path is not None:
        rnn.weights_init = initialization.Constant(0)
    else:
        rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
Exemplo n.º 47
0
def build_model_hard(vocab_size, args, dtype=floatX):
    """Build the symbolic graph of a hard-gated recurrent language model.

    The stack is one ``SimpleRecurrent`` layer followed by
    ``args.layers - 1`` ``HardGatedRecurrent`` layers. The states of all
    layers are concatenated before the output layer.

    Reads ``context``, ``state_dim``, ``layers``, ``skip_connections`` and
    ``mini_batch_size`` from ``args``. ``dtype`` is not used by the body.

    Returns ``(cost, cross_entropy, updates)`` where ``updates`` pairs each
    layer's initial-state shared variable with that layer's last state.
    """
    logger.info('Building model ...')

    # Hyper-parameters
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections
    batch_size = args.mini_batch_size

    def layer_suffix(depth):
        # The first layer uses unsuffixed names, deeper ones get "_<depth>".
        return '_' + str(depth) if depth > 0 else ''

    # Symbolic inputs, both Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # One fork output per layer fed by the input: only the first layer
    # unless skip connections are enabled.
    output_names = ["inputs" + layer_suffix(depth)
                    for depth in range(layers)
                    if depth == 0 or skip_connections]
    output_dims = [state_dim for _ in output_names]

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names,
                input_dim=batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence([lookup.apply]))

    # Bottom layer is a plain recurrent layer; each layer above it is a
    # hard-gated layer with its own MLP.
    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]
    for gate_idx in range(layers - 1):
        gate_mlp = MLP(activations=[Logistic()],
                       dims=[2 * state_dim, 1],
                       weights_init=initialization.IsotropicGaussian(0.1),
                       biases_init=initialization.Constant(0),
                       name="mlp_" + str(gate_idx))
        transitions.append(
            HardGatedRecurrent(dim=state_dim,
                               mlp=gate_mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # The output layer sees the concatenation of all layer states.
    output_layer = Linear(input_dim=layers * state_dim,
                          output_dim=vocab_size,
                          name="output_layer")

    # Fork output(s): (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Name the input of each layer
    if skip_connections:
        for idx, layer_input in enumerate(pre_rnn):
            layer_input.name = "pre_rnn_" + str(idx)
    else:
        pre_rnn.name = "pre_rnn"

    # Assemble the keyword arguments of the recurrent stack. The initial
    # state of each layer is a shared variable so it can persist.
    kwargs = OrderedDict()
    init_states = {}
    for depth in range(layers):
        suffix = layer_suffix(depth)
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[depth]
        elif depth == 0:
            kwargs['inputs' + suffix] = pre_rnn
        zero_state = numpy.zeros((batch_size, state_dim)).astype(floatX)
        init_states[depth] = theano.shared(zero_state,
                                           name='state0_%d' % depth)
        kwargs['states' + suffix] = init_states[depth]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # h = [state_1, state_2, state_3 ...]
    # Keep the last time step of every layer before concatenating.
    last_states = dict(
        (depth, h[depth][-1, :, :]) for depth in range(layers))

    # Concatenate all the states
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    h.name = "hidden_state"

    # Carry the final states over to the next call
    updates = [(init_states[depth], last_states[depth])
               for depth in range(layers)]

    # The first `context` time steps are excluded from the cost.
    presoft = output_layer.apply(h[context:, :, :])
    presoft.name = 'presoft'
    n_steps, n_batch, n_feat = presoft.shape

    flat_targets = y[context:, :].flatten()
    flat_presoft = presoft.reshape((n_batch * n_steps, n_feat))
    cross_entropy = Softmax().categorical_cross_entropy(
        flat_targets, flat_presoft)
    # Dividing by log(2) expresses the cross entropy in bits.
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # Adding log(1) yields a distinct variable for monitoring.
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates
Exemplo n.º 48
0
class Parrot(Initializable, Random):
    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of a vocoder frame
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feedback to the first rnn (out_to_h1)
            full_feedback=False,  # Feedback to all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # How many speakers there are?
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stabilities
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at initialization
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            raw_output=False,
            **kwargs):
        """Store the hyper-parameters and create all child bricks.

        Three GRU layers are created together with the forks and linear
        maps that connect them to each other, to the attention, to the
        readouts and to the output. Speaker conditioning, output feedback
        and the raw-audio sampler are only created when the matching flag
        is set. ``full_feedback`` forces ``weak_feedback`` on.
        """
        super(Parrot, self).__init__(**kwargs)

        # Plain hyper-parameters
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        # Attention settings
        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        # Encoder settings
        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim
        self.raw_output = raw_output

        # A bidirectional encoder doubles the encoded width; otherwise the
        # encoded input keeps the label dimension.
        if encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim
        else:
            self.encoded_input_dim = input_dim
        encoded_dim = self.encoded_input_dim

        if feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        def gru_fork(target, source_dim, name):
            # Fork feeding the inputs and gates of the GRU named `target`.
            return Fork(
                output_names=[target + '_inputs', target + '_gates'],
                input_dim=source_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name=name)

        def readout_linear(source_dim, name):
            # Linear map from `source_dim` to the readout dimension.
            return Linear(
                input_dim=source_dim,
                output_dim=readouts_dim,
                name=name)

        def gmm_fork(source_dim, name):
            # Fork producing the three groups of GMM parameters.
            return Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=source_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name=name)

        # Recurrent layers
        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

        # Hidden state -> readout
        self.h1_to_readout = readout_linear(rnn_h_dim, 'h1_to_readout')
        self.h2_to_readout = readout_linear(rnn_h_dim, 'h2_to_readout')
        self.h3_to_readout = readout_linear(rnn_h_dim, 'h3_to_readout')

        # Lower layer -> upper layer connections
        self.h1_to_h2 = gru_fork('rnn2', rnn_h_dim, 'h1_to_h2')
        self.h1_to_h3 = gru_fork('rnn3', rnn_h_dim, 'h1_to_h3')
        self.h2_to_h3 = gru_fork('rnn3', rnn_h_dim, 'h2_to_h3')

        # Readout -> output; the brick type depends on the cost
        if which_cost == 'MSE':
            self.readout_to_output = Linear(
                input_dim=readouts_dim,
                output_dim=output_dim,
                name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            self.readout_to_output = gmm_fork(
                readouts_dim, 'readout_to_output')

        self.encoder = Encoder(
            encoder_type,
            num_characters,
            input_dim,
            encoder_dim,
            name='encoder')

        self.children = [
            self.encoder,
            self.rnn1, self.rnn2, self.rnn3,
            self.h1_to_readout, self.h2_to_readout, self.h3_to_readout,
            self.h1_to_h2, self.h1_to_h3, self.h2_to_h3,
            self.readout_to_output]

        # Encoded input (attention window) -> every recurrent layer
        self.inp_to_h1 = gru_fork('rnn1', encoded_dim, 'inp_to_h1')
        self.inp_to_h2 = gru_fork('rnn2', encoded_dim, 'inp_to_h2')
        self.inp_to_h3 = gru_fork('rnn3', encoded_dim, 'inp_to_h3')

        self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3]

        # First hidden state -> attention parameters, attention -> readout
        self.h1_to_att = Fork(
            output_names=['alpha', 'beta', 'kappa'],
            input_dim=rnn_h_dim,
            output_dims=[attention_size] * 3,
            name='h1_to_att')
        self.att_to_readout = readout_linear(encoded_dim, 'att_to_readout')

        self.children += [self.h1_to_att, self.att_to_readout]

        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            # Speaker embedding -> every recurrent layer and the readout
            self.speaker_to_h1 = gru_fork(
                'rnn1', speaker_dim, 'speaker_to_h1')
            self.speaker_to_h2 = gru_fork(
                'rnn2', speaker_dim, 'speaker_to_h2')
            self.speaker_to_h3 = gru_fork(
                'rnn3', speaker_dim, 'speaker_to_h3')
            self.speaker_to_readout = readout_linear(
                speaker_dim, 'speaker_to_readout')

            # Speaker embedding -> output, mirroring readout_to_output
            if which_cost == 'MSE':
                self.speaker_to_output = Linear(
                    input_dim=speaker_dim,
                    output_dim=output_dim,
                    name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = gmm_fork(
                    speaker_dim, 'speaker_to_output')

            self.children += [
                self.embed_speaker,
                self.speaker_to_h1,
                self.speaker_to_h2,
                self.speaker_to_h3,
                self.speaker_to_readout,
                self.speaker_to_output]

        if full_feedback:
            # Previous output -> second and third recurrent layers
            self.out_to_h2 = gru_fork('rnn2', output_dim, 'out_to_h2')
            self.out_to_h3 = gru_fork('rnn3', output_dim, 'out_to_h3')
            self.children += [self.out_to_h2, self.out_to_h3]
            # Full feedback implies feedback to the first layer as well.
            weak_feedback = True

        self.weak_feedback = weak_feedback

        if weak_feedback:
            # Previous output -> first recurrent layer
            self.out_to_h1 = gru_fork('rnn1', output_dim, 'out_to_h1')
            self.children += [self.out_to_h1]

        if self.raw_output:
            self.sampleRnn = SampleRnn()
            self.children += [self.sampleRnn]

    def _allocate(self):
        """Allocate the zero-filled ``initial_w`` shared variable.

        It has one entry per encoded input dimension and is tagged with the
        ``INITIAL_STATE`` role.
        """
        w_shape = (self.encoded_input_dim,)
        initial_w = shared_floatx_zeros(w_shape, name="initial_w")
        self.initial_w = initial_w
        add_role(initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        """Create the named symbolic inputs of the model.

        Returns the tuple ``(features, features_mask, labels, labels_mask,
        speaker, start_flag, raw_sequence)``. ``speaker`` is ``None`` unless
        ``self.use_speaker`` is set and ``raw_sequence`` is ``None`` unless
        ``self.raw_output`` is set.
        """
        features = tensor.tensor3('features')
        features_mask = tensor.matrix('features_mask')
        labels = tensor.imatrix('labels')
        labels_mask = tensor.matrix('labels_mask')
        start_flag = tensor.scalar('start_flag')

        # Optional inputs are only created when the feature is enabled.
        speaker = (
            tensor.imatrix('speaker_index') if self.use_speaker else None)
        raw_sequence = (
            tensor.itensor3('raw_audio') if self.raw_output else None)

        return (features, features_mask, labels, labels_mask,
                speaker, start_flag, raw_sequence)

    def initial_states(self, batch_size):
        """Build the initial and carried-over state variables.

        For each of h1, h2, h3, w and k this returns an ``initial_*`` value
        and a zero-filled ``last_*`` shared variable. The result is the
        tuple ``(initial_h1, last_h1, initial_h2, last_h2, initial_h3,
        last_h3, initial_w, last_w, initial_k, last_k)``.
        """
        h_shape = (batch_size, self.rnn_h_dim)
        k_shape = (batch_size, self.attention_size)
        w_shape = (batch_size, self.encoded_input_dim)
        rnns = (self.rnn1, self.rnn2, self.rnn3)

        initial_h1, initial_h2, initial_h3 = [
            rnn.initial_states(batch_size) for rnn in rnns]
        last_h1, last_h2, last_h3 = [
            shared_floatx_zeros(h_shape) for _ in rnns]

        # k starts from symbolic zeros.
        initial_k = tensor.zeros(k_shape, dtype=floatX)
        last_k = shared_floatx_zeros(k_shape)

        # w starts from the initial_w parameter, repeated over the batch.
        # (Original author's open question: why not do the same for k?)
        initial_w = tensor.repeat(self.initial_w[None, :], batch_size, 0)
        last_w = shared_floatx_zeros(w_shape)

        return (initial_h1, last_h1, initial_h2, last_h2,
                initial_h3, last_h3,
                initial_w, last_w, initial_k, last_k)

    @application
    def compute_cost(
            self, features, features_mask, labels, labels_mask,
            speaker, start_flag, batch_size, raw_audio=None):
        """Build the training cost and the state-carrying updates.

        The model predicts ``features[1:]`` while scanning over time with
        three GRU layers and an attention window over the encoded labels.

        Parameters
        ----------
        features, features_mask
            Target sequence and its mask; the first time step is dropped to
            form the targets (presumably time is the leading axis --
            TODO confirm against the data stream).
        labels, labels_mask
            Label sequence passed to the encoder and its mask.
        speaker
            Speaker indices; only column 0 is used. May be ``None`` when
            ``self.use_speaker`` is false.
        start_flag
            Switch selecting the initial states (true) or the states saved
            by the previous call (false).
        batch_size
            Batch size used to shape the zero inputs and the states.
        raw_audio
            Raw audio tensor, only read when ``self.raw_output`` is set.

        Returns
        -------
        tuple
            ``(cost, updates, attention_vars, cost_raw)`` where ``updates``
            is the scan updates combined with the ``last_*`` state updates,
            ``attention_vars`` is ``[next_x, k, w, coeff, phi, pi_att]`` and
            ``cost_raw`` is ``None`` unless ``self.raw_output`` is set.
        """

        if speaker is None:
            assert not self.use_speaker

        # Targets are the features shifted by one step.
        target_features = features[1:]
        mask = features_mask[1:]

        # Zero-valued per-step inputs for the cells and gates of each GRU;
        # the optional feedback and speaker terms below are added to them.
        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            # Feedback uses the previous step's features (teacher forcing).
            input_features = features[:-1]

            if self.feedback_noise_level:
                # Gaussian noise scaled by the symbolic noise level.
                noise = self.theano_rng.normal(
                    size=input_features.shape,
                    avg=0., std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [
                out_cell_h1, out_gat_h1]
            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            # input_features is only defined in the weak_feedback branch.
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [
                out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]
            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            # Add a leading axis so the embedding broadcasts against the
            # per-step inputs.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2,
                spk_cell_h3, spk_gat_h3]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(
            start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(
            start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(
            start_flag, initial_h3, last_h3)
        input_w = tensor.switch(
            start_flag, initial_w, last_w)
        input_k = tensor.switch(
            start_flag, initial_k, last_k)

        # Encoded labels with masked positions zeroed out.
        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        # Position index along labels.shape[1], padded with two leading
        # axes so it broadcasts against the attention parameters.
        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def step(
                inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t,
                h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh):
            # One scan step: update rnn1 from the previous attention window,
            # compute the new window, then update rnn2 and rnn3.

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(
                inp_h1_t,
                gat_h1_t,
                h1_tm1, iterate=False)

            # Raw attention parameters predicted from the first layer.
            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            # k accumulates a positive increment at every step.
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            # Keep the unpadded a_t to return it as an extra scan output.
            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # 0.3989422917366028 is numpy.sqrt(1/(2*numpy.pi))
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [
                h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                inp_h2_t + h1inp_h2,
                gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [
                h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            # rnn3 receives contributions from both lower layers.
            h3_t = self.rnn3.apply(
                inp_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        # The five recurrent outputs get initial values; phi and a_t_ are
        # plain outputs (None in outputs_info).
        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1,
                input_h2,
                input_h3,
                input_k,
                input_w,
                None,
                None])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [
            h1_out, h2_out, h3_out]
        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        # The readout sums the three layers, the speaker term (if any) and
        # the attention window.
        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            # Squared error summed over the last axis.
            cost = tensor.sum((predicted - target_features) ** 2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            # Softmax over the k_gmm mixture components, restored to the
            # original shape afterwards.
            coeff = tensor.nnet.softmax(
                coeff.reshape(
                    (-1, self.k_gmm))).reshape(
                        coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        # Masked mean of the cost. The `0. * start_flag` term adds nothing
        # numerically; presumably it keeps start_flag in the graph --
        # TODO confirm.
        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        # Save the final states so the next call can continue from them.
        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        cost_raw = None
        if self.raw_output:
            # Repeat every mask entry 80 times along axis 0 (presumably the
            # number of raw samples per feature frame -- TODO confirm),
            # then swap the two axes.
            raw_mask = tensor.extra_ops.repeat(features_mask, 80, axis=0)
            raw_mask = raw_mask.dimshuffle(1, 0)

            # breakpointOp = PdbBreakpoint("Raw mask breakpoint")
            # condition = tensor.gt(raw_mask.shape[0], 0)
            # raw_mask = breakpointOp(condition, raw_mask)

            predicted_transposed = predicted.dimshuffle(1, 0, 2)

            last_h0, last_big_h0 = self.sampleRnn.initial_states(batch_size)
            # Swap the first two axes and flatten the remaining ones.
            raw_audio_reshaped = raw_audio.dimshuffle(1, 0, 2)
            raw_audio_reshaped = raw_audio_reshaped.reshape((raw_audio_reshaped.shape[0], -1))
            cost_raw, ip_cost, all_params, ip_params, other_params, new_h0, new_big_h0 =\
                self.sampleRnn.apply(raw_audio_reshaped, predicted_transposed, last_h0, last_big_h0, start_flag, raw_mask)

            if self.sampleRnn.N_RNN == 1:
                new_h0 = tensor.unbroadcast(new_h0, 1)
                new_big_h0 = tensor.unbroadcast(new_big_h0, 1)


            updates.append((last_h0, new_h0))
            updates.append((last_big_h0, new_big_h0))
            # cost = cost + 80.*cost_raw
            # With alpha_ = 0 and beta_ = 1 the returned cost is the raw
            # cost alone; the frame-level cost above gets zero weight.
            alpha_ = numpy.float32(0.)
            beta_ = numpy.float32(1.)
            cost = alpha_*cost + beta_*cost_raw

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars, cost_raw

    @application
    def sample_model_fun(
            self, labels, labels_mask, speaker, num_samples, seq_size):
        """Build the symbolic graph that samples an output sequence.

        A ``theano.scan`` runs ``sample_step`` once per entry of the
        per-step input sequences (each of length ``seq_size``). At every
        step the previously produced output ``x_tm1``, the three recurrent
        states and the attention variables ``k``/``w`` are fed back in.

        Parameters
        ----------
        labels
            Passed to ``self.encoder.apply``; ``labels.shape[1]`` sets the
            length of the attention position vector ``u``.
        labels_mask
            Multiplied (after right-padding) into the encoder output.
        speaker
            Only read when ``self.use_speaker`` is true; column 0 is
            looked up in ``self.embed_speaker``.
        num_samples
            Leading (batch) size of the initial states and of the first
            output fed to the scan.
        seq_size
            Length of the zero-initialised input sequences, i.e. the
            number of scan steps.

        Returns
        -------
        sample_x, k, w, pi, phi, pi_att, updates
            The scan outputs (sampled outputs, attention positions,
            attention context, ``coeff_t``, ``phi_t`` and the unpadded
            ``a_t``) followed by the scan update dictionary.
        """

        # Only the ``initial_*`` values are used below; the ``last_*``
        # shared variables returned alongside them are not read here.
        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(num_samples)

        # First "previous output" fed to the scan: all zeros.
        initial_x = numpy.zeros(
            (num_samples, self.output_dim), dtype=floatX)

        # Per-step additive inputs for the three RNNs. They start at zero
        # and only receive the speaker terms below; everything else is
        # added inside ``sample_step``.
        cell_shape = (seq_size, num_samples, self.rnn_h_dim)
        gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)

            # Applied before the broadcast.
            spk_readout = self.speaker_to_readout.apply(emb_speaker)
            spk_output = self.speaker_to_output.apply(emb_speaker)

            # Add dimension to repeat with time.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2,
                spk_cell_h3, spk_gat_h3]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += spk_cell_h1
            cell_h2 += spk_cell_h2
            cell_h3 += spk_cell_h3
            gat_h1 += spk_gat_h1
            gat_h2 += spk_gat_h2
            gat_h3 += spk_gat_h3

        # Encoded labels with masked-out positions zeroed.
        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        # Position index along the label axis, left-padded with two
        # dimensions so it lines up with the attention terms below.
        u = tensor.shape_padleft(
            tensor.arange(labels.shape[1], dtype=floatX), 2)

        def sample_step(
                inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t, inp_gat_h2_t,
                inp_cell_h3_t, inp_gat_h3_t, x_tm1, h1_tm1, h2_tm1, h3_tm1,
                k_tm1, w_tm1):
            # One sampling step. The first six arguments are the scan
            # sequences, the rest are the recurrent outputs of the
            # previous step (see ``outputs_info`` below).

            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            # rnn1 is conditioned on the previous attention context.
            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            # Feed the previous output back into rnn1.
            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [
                    out_cell_h1_t, out_gat_h1_t]
                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            # Feed the previous output back into rnn2 and rnn3 as well.
            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t,
                    out_cell_h3_t, out_gat_h3_t]
                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(
                cell_h1_t,
                gat_h1_t,
                h1_tm1, iterate=False)

            # Attention parameters are predicted from the rnn1 state.
            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            # The position is the previous position plus a positive
            # increment (exp), so it never decreases.
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            # Keep the unpadded weights; they are returned as ``pi_att``.
            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # 0.3989... is numpy.sqrt(1 / (2 * numpy.pi))
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) *
                    tensor.exp(-0.5 * b_t * (k_t_ - u)**2), axis=1)
            else:
                phi_t = tensor.sum(
                    a_t * tensor.exp(-b_t * (k_t_ - u)**2), axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            # rnn2 and rnn3 see the attention context of the current step.
            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            # Skip connections from rnn1 to rnn2 and rnn3.
            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [
                h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(
                cell_h2_t + h1inp_h2,
                gat_h2_t + h1gat_h2,
                h2_tm1, iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [
                h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(
                cell_h3_t + h1inp_h3 + h2inp_h3,
                gat_h3_t + h1gat_h3 + h2gat_h3,
                h3_tm1, iterate=False)

            # The readout sums projections of all three states, the
            # attention context and (optionally) the speaker embedding.
            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [
                h1_out_t, h2_out_t, h3_out_t]
            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            # NOTE(review): there is no ``else`` branch; any other value
            # of ``which_cost`` leaves ``predicted_x_t`` unbound.
            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                # A positive sampling_bias shrinks sigma and sharpens the
                # mixture weights before sampling.
                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape(
                            coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(
                    mu_t, sigma_t, coeff_t, self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        # The first six outputs are recurrent (they have an initial
        # value); the last three (``None``) are only collected.
        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            sequences=[
                cell_h1,
                gat_h1,
                cell_h2,
                gat_h2,
                cell_h3,
                gat_h3],
            non_sequences=[],
            outputs_info=[
                initial_x,
                initial_h1,
                initial_h2,
                initial_h3,
                initial_k,
                initial_w,
                None,
                None,
                None])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(
            self, labels_tr, labels_mask_tr, features_mask_tr,
            speaker_tr, num_samples, num_steps):
        """Compile the sampling graph and run it once on concrete inputs.

        ``labels_tr`` and ``labels_mask_tr`` (plus ``speaker_tr`` when
        ``self.use_speaker`` is set) are the numeric values fed to the
        compiled function. ``features_mask_tr`` is accepted but not used
        in this method. ``num_samples`` and ``num_steps`` are forwarded to
        ``sample_model_fun``.

        Returns the evaluated ``[sample_x, k, w, pi, phi, pi_att]``.
        """
        # Only the label and speaker variables are needed for sampling.
        _features, _features_mask, labels, labels_mask, speaker, \
            _start_flag, _raw_sequence = self.symbolic_input_variables()

        sample_x, k, w, pi, phi, pi_att, scan_updates = \
            self.sample_model_fun(
                labels, labels_mask, speaker, num_samples, num_steps)

        symbolic_args = [labels, labels_mask]
        concrete_args = [labels_tr, labels_mask_tr]

        # The speaker input exists only for speaker-conditioned models.
        if self.use_speaker:
            symbolic_args.append(speaker)
            concrete_args.append(speaker_tr)

        sampler = function(
            symbolic_args,
            [sample_x, k, w, pi, phi, pi_att],
            updates=scan_updates)
        return sampler(*concrete_args)

    def sample_using_input(self, data_tr, num_samples):
        """Evaluate the cost graph's attention outputs on a dataset batch.

        ``data_tr`` maps symbolic variable names to numeric values; every
        key must match the ``name`` of one of the non-None input
        variables. ``num_samples`` is forwarded to ``compute_cost``.

        Returns the evaluated ``[sample_x, k, w, pi, phi, pi_att]``.
        """
        features, features_mask, labels, labels_mask, speaker, \
            start_flag, _raw_sequence = self.symbolic_input_variables()

        # NOTE(review): a method earlier in this class returns
        # `cost, updates, attention_vars, cost_raw` (4 values); if that is
        # compute_cost, this 3-way unpacking raises ValueError - confirm
        # the signature and return arity.
        _cost, updates, attention_vars = self.compute_cost(
            features, features_mask, labels, labels_mask,
            speaker, start_flag, num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        # Index the available (non-None, de-duplicated) inputs by name.
        candidates = [
            features, features_mask, labels, labels_mask, speaker, start_flag]
        distinct = set([var for var in candidates if var is not None])
        by_name = {var.name: var for var in distinct}

        # Pair each provided array with its symbolic variable, in the
        # iteration order of ``data_tr``.
        keys = list(data_tr.keys())
        theano_inputs = [by_name[key] for key in keys]
        numpy_inputs = [data_tr[key] for key in keys]

        predictor = function(
            theano_inputs, [sample_x, k, w, pi, phi, pi_att],
            updates=updates)
        return predictor(*numpy_inputs)
Exemplo n.º 49
0
class BidirectionalPhonemeAudioEncoder(Initializable):
    """Three-stage bidirectional GRU encoder: audio -> phonemes -> words.

    Each stage is a ``BidirectionalWMT15`` wrapping a ``GatedRecurrent``
    and is fed through a pair of ``Fork`` bricks (one per direction).
    Between stages, the states at given end positions are gathered so the
    next stage runs over a shorter sequence.

    Parameters
    ----------
    feature_size
        Input dimension of the audio forks.
    embedding_dim
        The phoneme and word forks use ``2 * embedding_dim`` as their
        input dimension.
        NOTE(review): those forks consume the output of the previous
        bidirectional stage, so this presumably has to equal
        ``state_dim`` - confirm.
    state_dim
        Hidden size of every ``GatedRecurrent``.
    """

    def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
        super(BidirectionalPhonemeAudioEncoder, self).__init__(**kwargs)
        self.feature_size = feature_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim

        self.audio_embedding = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name="audio_embeddings")
        self.audio_fwd_fork = self._make_fork(
            self.audio_embedding, 'audio_fwd_fork')
        self.audio_back_fork = self._make_fork(
            self.audio_embedding, 'audio_back_fork')

        self.phoneme_embedding = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name="phoneme_embeddings")
        self.phoneme_fwd_fork = self._make_fork(
            self.phoneme_embedding, 'phoneme_fwd_fork')
        self.phoneme_back_fork = self._make_fork(
            self.phoneme_embedding, 'phoneme_back_fork')

        self.words_embedding = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name="words_embeddings")
        self.words_fwd_fork = self._make_fork(
            self.words_embedding, 'words_fwd_fork')
        self.words_back_fork = self._make_fork(
            self.words_embedding, 'words_back_fork')

        self.children = [
            self.phoneme_embedding, self.audio_embedding,
            self.words_embedding,
            self.phoneme_fwd_fork, self.phoneme_back_fork,
            self.audio_fwd_fork, self.audio_back_fork,
            self.words_fwd_fork, self.words_back_fork]

    @staticmethod
    def _make_fork(embedding, name):
        """Return a Linear ``Fork`` with one output per non-mask sequence
        input of ``embedding``'s prototype."""
        return Fork(
            [seq for seq in embedding.prototype.apply.sequences
             if seq != 'mask'],
            prototype=Linear(), name=name)

    @staticmethod
    def _encode(embedding, fwd_fork, back_fork, inputs, mask):
        """Run one bidirectional stage on ``inputs`` with ``mask``."""
        return embedding.apply(
            merge(fwd_fork.apply(inputs, as_dict=True), {'mask': mask}),
            merge(back_fork.apply(inputs, as_dict=True), {'mask': mask}))

    @staticmethod
    def _gather(states, rows, positions):
        """Pick ``states`` at ``positions`` for every batch row.

        ``states`` is swapped to batch-first for the indexing and swapped
        back afterwards.
        """
        batch_first = states.dimshuffle(1, 0, 2)
        return batch_first[rows, positions].dimshuffle(1, 0, 2)

    def _push_allocation_config(self):
        """Set input/output dimensions of all six forks."""
        stages = (
            (self.audio_embedding, self.audio_fwd_fork,
             self.audio_back_fork, self.feature_size),
            (self.phoneme_embedding, self.phoneme_fwd_fork,
             self.phoneme_back_fork, 2 * self.embedding_dim),
            (self.words_embedding, self.words_fwd_fork,
             self.words_back_fork, 2 * self.embedding_dim),
        )
        for embedding, fwd_fork, back_fork, input_dim in stages:
            # children[0] is paired with the forward fork, children[1]
            # with the backward one.
            for index, fork in enumerate((fwd_fork, back_fork)):
                fork.input_dim = input_dim
                fork.output_dims = [
                    embedding.children[index].get_dim(name)
                    for name in fork.output_names]

    @application(inputs=['audio', 'audio_mask', 'phones_words_acoustic_ends', 'phones_words_acoustic_ends_mask', 'phoneme_words_ends', 'phoneme_words_ends_mask'],
                 outputs=['representation'])
    def apply(self, audio, audio_mask, phones_words_acoustic_ends, phones_words_acoustic_ends_mask, phoneme_words_ends, phoneme_words_ends_mask):
        """Encode audio into a word-level representation.

        ``audio`` and all masks arrive batch-first and are swapped to
        time-first before being handed to the recurrent bricks.
        ``phones_words_acoustic_ends`` indexes audio steps and
        ``phoneme_words_ends`` indexes the resulting phoneme steps.

        Returns the output of the word-level bidirectional stage.
        """
        batch_size = audio.shape[0]
        # Column vector of batch indices, paired with the position
        # matrices in the advanced indexing of ``_gather``.
        rows = tensor.arange(batch_size).reshape((batch_size, 1))

        # Stage 1: encode every audio frame.
        audio_embeddings = self._encode(
            self.audio_embedding, self.audio_fwd_fork, self.audio_back_fork,
            audio.dimshuffle(1, 0, 2), audio_mask.dimshuffle(1, 0))

        # Stage 2: keep the audio states at phoneme ends and re-encode.
        phoneme_embeddings = self._gather(
            audio_embeddings, rows, phones_words_acoustic_ends)
        words_embeddings = self._encode(
            self.phoneme_embedding, self.phoneme_fwd_fork,
            self.phoneme_back_fork, phoneme_embeddings,
            phones_words_acoustic_ends_mask.dimshuffle(1, 0))

        # Stage 3: keep the phoneme states at word ends and re-encode.
        # The word-level forks must consume ``words_embeddings``: it is
        # the sequence whose time axis matches
        # ``phoneme_words_ends_mask``. Feeding ``phoneme_embeddings``
        # here (as the code previously did) left ``words_embeddings``
        # unused and paired phoneme-length inputs with a word-length mask.
        words_embeddings = self._gather(
            words_embeddings, rows, phoneme_words_ends)
        representation = self._encode(
            self.words_embedding, self.words_fwd_fork, self.words_back_fork,
            words_embeddings, phoneme_words_ends_mask.dimshuffle(1, 0))

        return representation
Exemplo n.º 50
0
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology.
    For compatibility with previous versions, we call it Decimator.

    Parameters
    ----------
    vocab_size
        Number of rows of the embedding lookup table.
    embedding_dim
        Width of the embeddings; also the input size of ``gru_fork`` and
        of the bidirectional weighting encoder.
    dgru_state_dim
        State size of each ``DGRU`` in the stack; the bidirectional
        weighting encoder uses ``dgru_state_dim // 2`` per direction.
    dgru_depth
        Number of stacked ``DGRU`` layers.
    """
    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.dgru_depth = dgru_depth

        self.lookup = LookupTable(name='embeddings')
        # representation
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)
        # importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(
                DGRU(activation=Tanh(), dim=self.dgru_state_dim // 2),
                self.embedding_dim,
                name='src_word_with_fork'),
            name='bidir_src_word_encoder')

        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(),
            name='gru_fork')
        # map to an energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [
            self.lookup, self.dgru, self.gru_fork, self.bidir_w, self.wl
        ]

    def _push_allocation_config(self):
        """Propagate sizes to the lookup table and the GRU fork."""
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [
            self.dgru.get_dim(name) for name in self.gru_fork.output_names
        ]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        """Encode a character sequence into sampled word representations.

        ``char_seq`` is looked up in the embedding table, ``char_aux`` is
        used as the recurrent mask, and ``sample_matrix`` is batch-
        multiplied with the weighted GRU states.

        Returns the sampled representation and the per-step weights.
        """
        # Time as first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(**merge(
            self.gru_fork.apply(embeddings, as_dict=True), {'mask': char_aux}))
        # Positive importance weight for every step (exp of a scalar).
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

        # With several stacked layers only the last entry of the stack
        # output is kept.
        if self.dgru_depth > 1:
            gru_out = gru_out[-1]

        # The weight has a trailing axis of size 1 (``wl`` outputs one
        # value); mark it broadcastable so it scales every state unit.
        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        """Return the dimension of ``name``.

        ``'output'`` maps to ``dgru_state_dim``; every other name is
        delegated to the parent class.
        """
        if name == 'output':
            return self.dgru_state_dim
        # The delegated result must be returned; without ``return`` the
        # method silently yielded None for every other name.
        return super(Decimator, self).get_dim(name)
Exemplo n.º 51
0
class Parrot(Initializable, Random):
    def __init__(
            self,
            input_dim=420,  # Dimension of the text labels
            output_dim=63,  # Dimension of a vocoder frame
            rnn_h_dim=1024,  # Size of rnn hidden state
            readouts_dim=1024,  # Size of readouts (summary of rnn)
            weak_feedback=False,  # Feed the output back into rnn1
            full_feedback=False,  # Feed the output back into all rnn layers
            feedback_noise_level=None,  # Amount of noise in feedback
            layer_norm=False,  # Use simple normalization?
            use_speaker=False,  # Condition on the speaker id?
            num_speakers=21,  # How many speakers there are?
            speaker_dim=128,  # Size of speaker embedding
            which_cost='MSE',  # Train with MSE or GMM
            k_gmm=20,  # How many components in the GMM
            sampling_bias=0,  # Make samples more likely (Graves13)
            epsilon=1e-5,  # Numerical stability constant
            num_characters=43,  # how many chars in the labels
            attention_type='graves',  # graves or softmax
            attention_size=10,  # number of gaussians in the attention
            attention_alignment=1.,  # audio steps per letter at initialization
            sharpening_coeff=1.,
            timing_coeff=1.,
            encoder_type=None,
            encoder_dim=128,
            **kwargs):
        """Create all child bricks of the model and register them.

        The arguments are stored on the instance and used to size three
        ``GatedRecurrent`` layers, the projections between them, the
        attention parameter fork, the readout/output layers and the
        optional speaker and output-feedback forks. Remaining keyword
        arguments are forwarded to the parent constructor.

        Notes
        -----
        * ``full_feedback=True`` also turns ``weak_feedback`` on.
        * ``encoder_type == 'bidirectional'`` sets the attention context
          width (``encoded_input_dim``) to ``2 * encoder_dim``; otherwise
          it equals ``input_dim``.
        * ``sampling_bias`` and ``k_gmm`` are only stored when
          ``which_cost == 'GMM'``.
        * ``sharpening_coeff`` and ``timing_coeff`` are only stored here.
        """

        super(Parrot, self).__init__(**kwargs)

        self.input_dim = input_dim
        self.output_dim = output_dim
        self.rnn_h_dim = rnn_h_dim
        self.readouts_dim = readouts_dim
        self.layer_norm = layer_norm
        self.which_cost = which_cost
        self.use_speaker = use_speaker
        self.full_feedback = full_feedback
        self.feedback_noise_level = feedback_noise_level
        self.epsilon = epsilon

        self.num_characters = num_characters
        self.attention_type = attention_type
        self.attention_alignment = attention_alignment
        self.attention_size = attention_size
        self.sharpening_coeff = sharpening_coeff
        self.timing_coeff = timing_coeff

        self.encoder_type = encoder_type
        self.encoder_dim = encoder_dim

        # Width of the attention context vector.
        self.encoded_input_dim = input_dim

        if self.encoder_type == 'bidirectional':
            self.encoded_input_dim = 2 * encoder_dim

        # Symbolic scalar for the feedback noise level; it only exists
        # when a noise level was given.
        if self.feedback_noise_level is not None:
            self.noise_level_var = tensor.scalar('feedback_noise_level')

        self.rnn1 = GatedRecurrent(dim=rnn_h_dim, name='rnn1')
        self.rnn2 = GatedRecurrent(dim=rnn_h_dim, name='rnn2')
        self.rnn3 = GatedRecurrent(dim=rnn_h_dim, name='rnn3')

        # Every recurrent state is projected to the readout.
        self.h1_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h1_to_readout')

        self.h2_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h2_to_readout')

        self.h3_to_readout = Linear(input_dim=rnn_h_dim,
                                    output_dim=readouts_dim,
                                    name='h3_to_readout')

        # Connections between recurrent layers. Each fork yields an
        # "inputs" part (rnn_h_dim) and a "gates" part (2 * rnn_h_dim).
        self.h1_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h2')

        self.h1_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h1_to_h3')

        self.h2_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                             input_dim=rnn_h_dim,
                             output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                             name='h2_to_h3')

        # NOTE(review): there is no ``else`` branch; any other value of
        # ``which_cost`` leaves ``readout_to_output`` unset and the
        # ``children`` list below fails with AttributeError.
        if which_cost == 'MSE':
            self.readout_to_output = Linear(input_dim=readouts_dim,
                                            output_dim=output_dim,
                                            name='readout_to_output')
        elif which_cost == 'GMM':
            self.sampling_bias = sampling_bias
            self.k_gmm = k_gmm
            self.readout_to_output = Fork(
                output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                input_dim=readouts_dim,
                output_dims=[output_dim * k_gmm, output_dim * k_gmm, k_gmm],
                name='readout_to_output')

        self.encoder = Encoder(encoder_type,
                               num_characters,
                               input_dim,
                               encoder_dim,
                               name='encoder')

        self.children = [
            self.encoder, self.rnn1, self.rnn2, self.rnn3, self.h1_to_readout,
            self.h2_to_readout, self.h3_to_readout, self.h1_to_h2,
            self.h1_to_h3, self.h2_to_h3, self.readout_to_output
        ]

        # The attention context feeds every recurrent layer.
        self.inp_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h1')

        self.inp_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h2')

        self.inp_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                              input_dim=self.encoded_input_dim,
                              output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                              name='inp_to_h3')

        self.children += [self.inp_to_h1, self.inp_to_h2, self.inp_to_h3]

        # Three attention parameter vectors, each of size attention_size.
        self.h1_to_att = Fork(output_names=['alpha', 'beta', 'kappa'],
                              input_dim=rnn_h_dim,
                              output_dims=[attention_size] * 3,
                              name='h1_to_att')

        self.att_to_readout = Linear(input_dim=self.encoded_input_dim,
                                     output_dim=readouts_dim,
                                     name='att_to_readout')

        self.children += [self.h1_to_att, self.att_to_readout]

        # Optional speaker conditioning: one embedding table plus
        # projections to each recurrent layer, the readout and the output.
        if use_speaker:
            self.num_speakers = num_speakers
            self.speaker_dim = speaker_dim
            self.embed_speaker = LookupTable(num_speakers, speaker_dim)

            self.speaker_to_h1 = Fork(
                output_names=['rnn1_inputs', 'rnn1_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h1')

            self.speaker_to_h2 = Fork(
                output_names=['rnn2_inputs', 'rnn2_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h2')

            self.speaker_to_h3 = Fork(
                output_names=['rnn3_inputs', 'rnn3_gates'],
                input_dim=speaker_dim,
                output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                name='speaker_to_h3')

            self.speaker_to_readout = Linear(input_dim=speaker_dim,
                                             output_dim=readouts_dim,
                                             name='speaker_to_readout')

            # Mirrors the shape of ``readout_to_output`` for each cost.
            if which_cost == 'MSE':
                self.speaker_to_output = Linear(input_dim=speaker_dim,
                                                output_dim=output_dim,
                                                name='speaker_to_output')
            elif which_cost == 'GMM':
                self.speaker_to_output = Fork(
                    output_names=['gmm_mu', 'gmm_sigma', 'gmm_coeff'],
                    input_dim=speaker_dim,
                    output_dims=[
                        output_dim * k_gmm, output_dim * k_gmm, k_gmm
                    ],
                    name='speaker_to_output')

            self.children += [
                self.embed_speaker, self.speaker_to_h1, self.speaker_to_h2,
                self.speaker_to_h3, self.speaker_to_readout,
                self.speaker_to_output
            ]

        # Output feedback into rnn2 and rnn3; this also forces the rnn1
        # feedback below to be created.
        if full_feedback:
            self.out_to_h2 = Fork(output_names=['rnn2_inputs', 'rnn2_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h2')

            self.out_to_h3 = Fork(output_names=['rnn3_inputs', 'rnn3_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h3')
            self.children += [self.out_to_h2, self.out_to_h3]
            weak_feedback = True

        # Stored only now so that it reflects the override above.
        self.weak_feedback = weak_feedback

        if weak_feedback:
            self.out_to_h1 = Fork(output_names=['rnn1_inputs', 'rnn1_gates'],
                                  input_dim=output_dim,
                                  output_dims=[rnn_h_dim, 2 * rnn_h_dim],
                                  name='out_to_h1')
            self.children += [self.out_to_h1]

    def _allocate(self):
        """Create the shared initial attention context ``initial_w``.

        It is a zero vector of length ``encoded_input_dim`` tagged with
        the ``INITIAL_STATE`` role.
        """
        w_shape = (self.encoded_input_dim, )
        self.initial_w = shared_floatx_zeros(w_shape, name="initial_w")
        add_role(self.initial_w, INITIAL_STATE)

    def symbolic_input_variables(self):
        """Create the named symbolic inputs of the model.

        Returns the tuple ``(features, features_mask, labels,
        labels_mask, speaker, start_flag)``. ``speaker`` is ``None``
        unless ``self.use_speaker`` is set.
        """
        features = tensor.tensor3('features')
        features_mask = tensor.matrix('features_mask')
        labels = tensor.imatrix('labels')
        labels_mask = tensor.matrix('labels_mask')
        start_flag = tensor.scalar('start_flag')

        # The speaker index variable exists only for conditioned models.
        speaker = tensor.imatrix('speaker_index') if self.use_speaker \
            else None

        return (features, features_mask, labels, labels_mask,
                speaker, start_flag)

    def initial_states(self, batch_size):
        """Return initial and carried-over states for a batch.

        For each of ``h1``, ``h2``, ``h3``, ``w`` and ``k`` two values
        are produced: an ``initial_*`` expression and a ``last_*``
        zero-filled shared variable with ``batch_size`` rows.

        Returns ``(initial_h1, last_h1, initial_h2, last_h2, initial_h3,
        last_h3, initial_w, last_w, initial_k, last_k)``.
        """
        initial_h1, initial_h2, initial_h3 = [
            rnn.initial_states(batch_size)
            for rnn in (self.rnn1, self.rnn2, self.rnn3)]

        hidden_shape = (batch_size, self.rnn_h_dim)
        last_h1, last_h2, last_h3 = [
            shared_floatx_zeros(hidden_shape) for _ in range(3)]

        # The attention position always starts from zeros.
        attention_shape = (batch_size, self.attention_size)
        initial_k = tensor.zeros(attention_shape, dtype=floatX)
        last_k = shared_floatx_zeros(attention_shape)

        # The attention context starts from the shared ``initial_w``
        # vector, repeated once per batch row.
        w_row = self.initial_w[None, :]
        initial_w = tensor.repeat(w_row, batch_size, 0)
        last_w = shared_floatx_zeros((batch_size, self.encoded_input_dim))

        return (initial_h1, last_h1, initial_h2, last_h2,
                initial_h3, last_h3,
                initial_w, last_w, initial_k, last_k)

    @application
    def compute_cost(self, features, features_mask, labels, labels_mask,
                     speaker, start_flag, batch_size):

        if speaker is None:
            assert not self.use_speaker

        target_features = features[1:]
        mask = features_mask[1:]

        cell_shape = (mask.shape[0], batch_size, self.rnn_h_dim)
        gat_shape = (mask.shape[0], batch_size, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.weak_feedback:
            input_features = features[:-1]

            if self.feedback_noise_level:
                noise = self.theano_rng.normal(size=input_features.shape,
                                               avg=0.,
                                               std=1.)
                input_features += self.noise_level_var * noise

            out_cell_h1, out_gat_h1 = self.out_to_h1.apply(input_features)

            to_normalize = [out_cell_h1, out_gat_h1]
            out_cell_h1, out_gat_h1 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += out_cell_h1
            gat_h1 += out_gat_h1

        if self.full_feedback:
            assert self.weak_feedback
            out_cell_h2, out_gat_h2 = self.out_to_h2.apply(input_features)
            out_cell_h3, out_gat_h3 = self.out_to_h3.apply(input_features)

            to_normalize = [out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3]
            out_cell_h2, out_gat_h2, out_cell_h3, out_gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h2 += out_cell_h2
            gat_h2 += out_gat_h2
            cell_h3 += out_cell_h3
            gat_h3 += out_gat_h3

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3,
                spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 = spk_cell_h1 + cell_h1
            cell_h2 = spk_cell_h2 + cell_h2
            cell_h3 = spk_cell_h3 + cell_h3
            gat_h1 = spk_gat_h1 + gat_h1
            gat_h2 = spk_gat_h2 + gat_h2
            gat_h3 = spk_gat_h3 + gat_h3

        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(batch_size)

        # If it's a new example, use initial states.
        input_h1 = tensor.switch(start_flag, initial_h1, last_h1)
        input_h2 = tensor.switch(start_flag, initial_h2, last_h2)
        input_h3 = tensor.switch(start_flag, initial_h3, last_h3)
        input_w = tensor.switch(start_flag, initial_w, last_w)
        input_k = tensor.switch(start_flag, initial_k, last_k)

        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX),
                                 2)

        def step(inp_h1_t, gat_h1_t, inp_h2_t, gat_h2_t, inp_h3_t, gat_h3_t,
                 h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1, context_oh):

            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            inp_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            h1_t = self.rnn1.apply(inp_h1_t, gat_h1_t, h1_tm1, iterate=False)

            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            b_t = tensor.exp(b_t) + self.epsilon
            k_t = k_tm1 + self.attention_alignment * tensor.exp(k_t)

            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            inp_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            inp_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(inp_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(inp_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            return h1_t, h2_t, h3_t, k_t, w_t, phi_t, a_t_

        (h1, h2, h3, k, w, phi, pi_att), scan_updates = theano.scan(
            fn=step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[context_oh],
            outputs_info=[
                input_h1, input_h2, input_h3, input_k, input_w, None, None
            ])

        h1_out = self.h1_to_readout.apply(h1)
        h2_out = self.h2_to_readout.apply(h2)
        h3_out = self.h3_to_readout.apply(h3)

        to_normalize = [h1_out, h2_out, h3_out]
        h1_out, h2_out, h3_out = \
            [_apply_norm(x, self.layer_norm) for x in to_normalize]

        readouts = h1_out + h2_out + h3_out

        if self.use_speaker:
            readouts += self.speaker_to_readout.apply(emb_speaker)

        readouts += self.att_to_readout.apply(w)

        predicted = self.readout_to_output.apply(readouts)

        if self.which_cost == 'MSE':
            if self.use_speaker:
                predicted += self.speaker_to_output.apply(emb_speaker)
            cost = tensor.sum((predicted - target_features)**2, axis=-1)

            next_x = predicted
            # Dummy value for coeff
            coeff = predicted
        elif self.which_cost == 'GMM':
            mu, sigma, coeff = predicted
            if self.use_speaker:
                spk_to_out = self.speaker_to_output.apply(emb_speaker)
                mu += spk_to_out[0]
                sigma += spk_to_out[1]
                coeff += spk_to_out[2]

            # When training there should not be sampling_bias
            sigma = tensor.exp(sigma) + self.epsilon

            coeff = tensor.nnet.softmax(coeff.reshape(
                (-1, self.k_gmm))).reshape(coeff.shape) + self.epsilon

            cost = cost_gmm(target_features, mu, sigma, coeff)
            next_x = sample_gmm(mu, sigma, coeff, self.theano_rng)

        cost = (cost * mask).sum() / (mask.sum() + 1e-5) + 0. * start_flag

        updates = []
        updates.append((last_h1, h1[-1]))
        updates.append((last_h2, h2[-1]))
        updates.append((last_h3, h3[-1]))
        updates.append((last_k, k[-1]))
        updates.append((last_w, w[-1]))

        attention_vars = [next_x, k, w, coeff, phi, pi_att]

        return cost, scan_updates + updates, attention_vars

    @application
    def sample_model_fun(self, labels, labels_mask, speaker, num_samples,
                         seq_size):
        """Build the symbolic graph that generates output sequences.

        A ``theano.scan`` runs ``sample_step`` for ``seq_size`` steps over
        ``num_samples`` parallel sequences. Each step's output is fed back
        as ``x_tm1`` of the next step, starting from an all-zero frame.

        Parameters
        ----------
        labels, labels_mask
            Symbolic conditioning sequence and its mask; ``labels`` is
            passed to ``self.encoder`` and the result is multiplied by the
            right-padded mask.
        speaker
            Symbolic speaker ids; only read when ``self.use_speaker`` is
            true, in which case column 0 is embedded.
        num_samples
            Number of sequences generated in parallel (used as the batch
            dimension of the initial states and of the zero inputs).
        seq_size
            Number of steps to generate.

        Returns
        -------
        tuple
            ``(sample_x, k, w, pi, phi, pi_att, updates)``: the stacked
            per-step outputs of the scan (generated frames, attention
            positions ``k_t``, attention windows ``w_t``, ``coeff_t``,
            ``phi_t`` and the un-padded ``a_t``) followed by the scan
            updates.
        """

        # Only the initial_* values are used below; the last_* values are
        # unpacked but not referenced in this method.
        initial_h1, last_h1, initial_h2, last_h2, initial_h3, last_h3, \
            initial_w, last_w, initial_k, last_k = \
            self.initial_states(num_samples)

        # First "previous output" fed to the recurrence: an all-zero frame.
        initial_x = numpy.zeros((num_samples, self.output_dim), dtype=floatX)

        # Per-step additive inputs of the three recurrent layers. They start
        # as zeros; the speaker contribution (if any) is added right below
        # and the output/attention feedback is added inside the step.
        cell_shape = (seq_size, num_samples, self.rnn_h_dim)
        gat_shape = (seq_size, num_samples, 2 * self.rnn_h_dim)
        cell_h1 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h2 = tensor.zeros(cell_shape, dtype=floatX)
        cell_h3 = tensor.zeros(cell_shape, dtype=floatX)
        gat_h1 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h2 = tensor.zeros(gat_shape, dtype=floatX)
        gat_h3 = tensor.zeros(gat_shape, dtype=floatX)

        if self.use_speaker:
            speaker = speaker[:, 0]
            emb_speaker = self.embed_speaker.apply(speaker)

            # Applied before the broadcast.
            spk_readout = self.speaker_to_readout.apply(emb_speaker)
            spk_output = self.speaker_to_output.apply(emb_speaker)

            # Add dimension to repeat with time.
            emb_speaker = tensor.shape_padleft(emb_speaker)

            spk_cell_h1, spk_gat_h1 = self.speaker_to_h1.apply(emb_speaker)
            spk_cell_h2, spk_gat_h2 = self.speaker_to_h2.apply(emb_speaker)
            spk_cell_h3, spk_gat_h3 = self.speaker_to_h3.apply(emb_speaker)

            to_normalize = [
                spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, spk_cell_h3,
                spk_gat_h3
            ]

            spk_cell_h1, spk_gat_h1, spk_cell_h2, spk_gat_h2, \
                spk_cell_h3, spk_gat_h3, = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            cell_h1 += spk_cell_h1
            cell_h2 += spk_cell_h2
            cell_h3 += spk_cell_h3
            gat_h1 += spk_gat_h1
            gat_h2 += spk_gat_h2
            gat_h3 += spk_gat_h3

        # Encoded labels with masked-out positions zeroed.
        context_oh = self.encoder.apply(labels) * \
            tensor.shape_padright(labels_mask)

        # Positions 0 .. labels.shape[1] - 1 along the label axis, with two
        # leading axes added so they broadcast against k_t_ in the step.
        u = tensor.shape_padleft(tensor.arange(labels.shape[1], dtype=floatX),
                                 2)

        def sample_step(inp_cell_h1_t, inp_gat_h1_t, inp_cell_h2_t,
                        inp_gat_h2_t, inp_cell_h3_t, inp_gat_h3_t, x_tm1,
                        h1_tm1, h2_tm1, h3_tm1, k_tm1, w_tm1):
            # One generation step. The first six arguments are the scan
            # sequences; the rest are the recurrent outputs of the previous
            # step (previous frame, three hidden states, attention position
            # and attention window). context_oh and u come from the
            # enclosing scope.

            cell_h1_t = inp_cell_h1_t
            cell_h2_t = inp_cell_h2_t
            cell_h3_t = inp_cell_h3_t

            gat_h1_t = inp_gat_h1_t
            gat_h2_t = inp_gat_h2_t
            gat_h3_t = inp_gat_h3_t

            # Layer 1 sees the attention window of the previous step.
            attinp_h1, attgat_h1 = self.inp_to_h1.apply(w_tm1)
            cell_h1_t += attinp_h1
            gat_h1_t += attgat_h1

            # Optional feedback of the previous output into layer 1.
            if self.weak_feedback:
                out_cell_h1_t, out_gat_h1_t = self.out_to_h1.apply(x_tm1)

                to_normalize = [out_cell_h1_t, out_gat_h1_t]
                out_cell_h1_t, out_gat_h1_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h1_t += out_cell_h1_t
                gat_h1_t += out_gat_h1_t

            # Optional feedback of the previous output into layers 2 and 3.
            if self.full_feedback:
                out_cell_h2_t, out_gat_h2_t = self.out_to_h2.apply(x_tm1)
                out_cell_h3_t, out_gat_h3_t = self.out_to_h3.apply(x_tm1)

                to_normalize = [
                    out_cell_h2_t, out_gat_h2_t, out_cell_h3_t, out_gat_h3_t
                ]
                out_cell_h2_t, out_gat_h2_t, \
                    out_cell_h3_t, out_gat_h3_t = \
                    [_apply_norm(x, self.layer_norm) for x in to_normalize]

                cell_h2_t += out_cell_h2_t
                cell_h3_t += out_cell_h3_t
                gat_h2_t += out_gat_h2_t
                gat_h3_t += out_gat_h3_t

            h1_t = self.rnn1.apply(cell_h1_t, gat_h1_t, h1_tm1, iterate=False)

            # Attention parameters are predicted from the first layer.
            a_t, b_t, k_t = self.h1_to_att.apply(h1_t)

            if self.attention_type == "softmax":
                a_t = tensor.nnet.softmax(a_t) + self.epsilon
            else:
                a_t = tensor.exp(a_t) + self.epsilon

            # At sampling time exp(b_t) is scaled by sharpening_coeff and the
            # position increment is divided by timing_coeff; k_t accumulates
            # on top of the previous position k_tm1.
            b_t = tensor.exp(b_t) * self.sharpening_coeff + self.epsilon
            k_t = k_tm1 + self.attention_alignment * \
                tensor.exp(k_t) / self.timing_coeff

            # Keep the un-padded a_t: it is returned as-is by the step.
            a_t_ = a_t
            a_t = tensor.shape_padright(a_t)
            b_t = tensor.shape_padright(b_t)
            k_t_ = tensor.shape_padright(k_t)

            # batch size X att size X len context
            if self.attention_type == "softmax":
                # numpy.sqrt(1/(2*numpy.pi)) is the weird number
                phi_t = 0.3989422917366028 * tensor.sum(
                    a_t * tensor.sqrt(b_t) * tensor.exp(-0.5 * b_t *
                                                        (k_t_ - u)**2),
                    axis=1)
            else:
                phi_t = tensor.sum(a_t * tensor.exp(-b_t * (k_t_ - u)**2),
                                   axis=1)

            # batch size X len context X num letters
            w_t = (tensor.shape_padright(phi_t) * context_oh).sum(axis=1)

            # Layers 2 and 3 see the attention window of the current step.
            attinp_h2, attgat_h2 = self.inp_to_h2.apply(w_t)
            attinp_h3, attgat_h3 = self.inp_to_h3.apply(w_t)
            cell_h2_t += attinp_h2
            gat_h2_t += attgat_h2
            cell_h3_t += attinp_h3
            gat_h3_t += attgat_h3

            # Layer 1 feeds both layer 2 and layer 3.
            h1inp_h2, h1gat_h2 = self.h1_to_h2.apply(h1_t)
            h1inp_h3, h1gat_h3 = self.h1_to_h3.apply(h1_t)

            to_normalize = [h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3]
            h1inp_h2, h1gat_h2, h1inp_h3, h1gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h2_t = self.rnn2.apply(cell_h2_t + h1inp_h2,
                                   gat_h2_t + h1gat_h2,
                                   h2_tm1,
                                   iterate=False)

            h2inp_h3, h2gat_h3 = self.h2_to_h3.apply(h2_t)

            to_normalize = [h2inp_h3, h2gat_h3]
            h2inp_h3, h2gat_h3 = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            h3_t = self.rnn3.apply(cell_h3_t + h1inp_h3 + h2inp_h3,
                                   gat_h3_t + h1gat_h3 + h2gat_h3,
                                   h3_tm1,
                                   iterate=False)

            # The readout sums contributions from all three layers, the
            # attention window and (optionally) the speaker.
            h1_out_t = self.h1_to_readout.apply(h1_t)
            h2_out_t = self.h2_to_readout.apply(h2_t)
            h3_out_t = self.h3_to_readout.apply(h3_t)

            to_normalize = [h1_out_t, h2_out_t, h3_out_t]
            h1_out_t, h2_out_t, h3_out_t = \
                [_apply_norm(x, self.layer_norm) for x in to_normalize]

            readout_t = h1_out_t + h2_out_t + h3_out_t

            readout_t += self.att_to_readout.apply(w_t)

            if self.use_speaker:
                readout_t += spk_readout

            output_t = self.readout_to_output.apply(readout_t)

            # NOTE(review): any which_cost other than 'MSE' or "GMM" leaves
            # predicted_x_t and coeff_t undefined at the return below.
            if self.which_cost == 'MSE':
                predicted_x_t = output_t
                if self.use_speaker:
                    predicted_x_t += spk_output

                # Dummy value for coeff_t
                coeff_t = predicted_x_t
            elif self.which_cost == "GMM":
                mu_t, sigma_t, coeff_t = output_t
                if self.use_speaker:
                    mu_t += spk_output[0]
                    sigma_t += spk_output[1]
                    coeff_t += spk_output[2]

                # sampling_bias is subtracted from sigma_t before the exp
                # and scales the mixture logits by (1 + sampling_bias).
                sigma_t = tensor.exp(sigma_t - self.sampling_bias) + \
                    self.epsilon

                coeff_t = tensor.nnet.softmax(
                    coeff_t.reshape(
                        (-1, self.k_gmm)) * (1. + self.sampling_bias)).reshape(
                            coeff_t.shape) + self.epsilon

                predicted_x_t = sample_gmm(mu_t, sigma_t, coeff_t,
                                           self.theano_rng)

            return predicted_x_t, h1_t, h2_t, h3_t, \
                k_t, w_t, coeff_t, phi_t, a_t_

        # The first six step outputs are recurrent (they have initial
        # values); the last three (coeff_t, phi_t, a_t_) are only collected.
        (sample_x, h1, h2, h3, k, w, pi, phi, pi_att), updates = theano.scan(
            fn=sample_step,
            sequences=[cell_h1, gat_h1, cell_h2, gat_h2, cell_h3, gat_h3],
            non_sequences=[],
            outputs_info=[
                initial_x, initial_h1, initial_h2, initial_h3, initial_k,
                initial_w, None, None, None
            ])

        return sample_x, k, w, pi, phi, pi_att, updates

    def sample_model(self, labels_tr, labels_mask_tr, features_mask_tr,
                     speaker_tr, num_samples, num_steps):
        """Compile the sampling graph and run it once on concrete inputs.

        The symbolic labels, labels mask and speaker variables are passed
        to ``sample_model_fun`` together with ``num_samples`` and
        ``num_steps``. The compiled function takes the labels and their
        mask, plus the speaker ids when ``self.use_speaker`` is set.
        ``features_mask_tr`` is accepted but not used.

        Returns the evaluated ``[sample_x, k, w, pi, phi, pi_att]``.
        """
        symbolic = self.symbolic_input_variables()
        # Only labels, labels mask and speaker are needed for sampling.
        _, _, labels, labels_mask, speaker, _ = symbolic

        graph = self.sample_model_fun(
            labels, labels_mask, speaker, num_samples, num_steps)
        sample_x, k, w, pi, phi, pi_att, updates = graph

        symbolic_args = [labels, labels_mask]
        concrete_args = [labels_tr, labels_mask_tr]
        if self.use_speaker:
            symbolic_args.append(speaker)
            concrete_args.append(speaker_tr)

        sampler = function(symbolic_args,
                           [sample_x, k, w, pi, phi, pi_att],
                           updates=updates)
        return sampler(*concrete_args)

    def sample_using_input(self, data_tr, num_samples):
        """Evaluate the training graph's attention outputs on a data batch.

        The cost graph is built with ``compute_cost`` and its attention
        variables are compiled into a function. Each key of ``data_tr`` is
        matched to the symbolic input with the same ``name``, and the
        corresponding values are fed in the iteration order of ``data_tr``.

        Returns the evaluated ``[sample_x, k, w, pi, phi, pi_att]``.
        """
        symbolic = self.symbolic_input_variables()
        features, features_mask, labels, labels_mask, speaker, start_flag = \
            symbolic

        cost, updates, attention_vars = self.compute_cost(
            features, features_mask, labels, labels_mask, speaker, start_flag,
            num_samples)
        sample_x, k, w, pi, phi, pi_att = attention_vars

        # Index the available (non-None, de-duplicated) inputs by name.
        available = set(
            var for var in (features, features_mask, labels, labels_mask,
                            speaker, start_flag)
            if var is not None)
        by_name = {var.name: var for var in available}

        keys = list(data_tr.keys())
        theano_inputs = [by_name[key] for key in keys]
        numpy_inputs = [data_tr[key] for key in keys]

        predictor = function(theano_inputs,
                             [sample_x, k, w, pi, phi, pi_att],
                             updates=updates)
        return predictor(*numpy_inputs)
Exemplo n.º 52
0
def build_model_soft(vocab_size, args, dtype=floatX):
    """Build a stacked RNN language model with soft-gated upper layers.

    The bottom layer is a ``SimpleRecurrent``; every further layer is a
    ``SoftGatedRecurrent`` driven by its own MLP. The states of all layers
    are concatenated and projected to ``vocab_size`` logits, and the cost
    is the categorical cross-entropy divided by ``log(2)``, computed on
    the time steps from ``args.context`` onwards.

    Parameters
    ----------
    vocab_size : int
        Size of the lookup table and of the output layer.
    args
        Configuration object. The attributes read here are ``context``,
        ``state_dim``, ``layers``, ``skip_connections``,
        ``mini_batch_size``, ``mlp_layers`` and ``mlp_activation``.
    dtype
        Not used by this function; kept so the signature stays compatible.

    Returns
    -------
    tuple
        ``(cost, cross_entropy, updates, gate_values)`` where ``updates``
        copies the last hidden state of every layer into the shared
        variable holding that layer's initial state.

    Raises
    ------
    ValueError
        If ``args.mlp_activation`` is not one of ``"logistic"``,
        ``"rectifier"`` or ``"hard_logistic"``.
    """
    logger.info('Building model ...')

    # Parameters for the model
    context = args.context
    state_dim = args.state_dim
    layers = args.layers
    skip_connections = args.skip_connections

    # Symbolic variables
    # In both cases: Time X Batch
    x = tensor.lmatrix('features')
    y = tensor.lmatrix('targets')

    # Name suffix of each layer: '' for the first, '_<d>' for the others.
    suffixes = ['_' + str(d) if d > 0 else '' for d in range(layers)]

    # Build the model
    # Only the first layer receives an input, unless skip connections feed
    # the input to every layer.
    output_names = []
    output_dims = []
    for d, suffix in enumerate(suffixes):
        if d == 0 or skip_connections:
            output_names.append("inputs" + suffix)
            output_dims.append(state_dim)

    lookup = LookupTable(length=vocab_size, dim=state_dim)
    lookup.weights_init = initialization.IsotropicGaussian(0.1)
    lookup.biases_init = initialization.Constant(0)

    fork = Fork(output_names=output_names, input_dim=args.mini_batch_size,
                output_dims=output_dims,
                prototype=FeedforwardSequence(
                    [lookup.apply]))

    transitions = [SimpleRecurrent(dim=state_dim, activation=Tanh())]

    # Build the MLP
    dims = [2 * state_dim]
    activations = []
    for _ in range(args.mlp_layers):
        activations.append(Rectifier())
        dims.append(state_dim)

    # Activation of the last layer of the MLP
    if args.mlp_activation == "logistic":
        activations.append(Logistic())
    elif args.mlp_activation == "rectifier":
        activations.append(Rectifier())
    elif args.mlp_activation == "hard_logistic":
        activations.append(HardLogistic())
    else:
        # An explicit raise (instead of ``assert False``) still fires when
        # Python runs with -O; otherwise ``activations`` and ``dims`` would
        # silently end up with mismatched lengths.
        raise ValueError(
            "Unknown mlp_activation: %r" % (args.mlp_activation,))

    # Output of MLP has dimension 1
    dims.append(1)

    for i in range(layers - 1):
        mlp = MLP(activations=activations, dims=dims,
                  weights_init=initialization.IsotropicGaussian(0.1),
                  biases_init=initialization.Constant(0),
                  name="mlp_" + str(i))
        transitions.append(
            SoftGatedRecurrent(dim=state_dim,
                               mlp=mlp,
                               activation=Tanh()))

    rnn = RecurrentStack(transitions, skip_connections=skip_connections)

    # dim = layers * state_dim
    output_layer = Linear(
        input_dim=layers * state_dim,
        output_dim=vocab_size, name="output_layer")

    # Return list of 3D Tensor, one for each layer
    # (Time X Batch X embedding_dim)
    pre_rnn = fork.apply(x)

    # Give a name to the input of each layer
    if skip_connections:
        for t, layer_input in enumerate(pre_rnn):
            layer_input.name = "pre_rnn_" + str(t)
    else:
        pre_rnn.name = "pre_rnn"

    # Prepare inputs for the RNN
    kwargs = OrderedDict()
    init_states = {}
    for d, suffix in enumerate(suffixes):
        if skip_connections:
            kwargs['inputs' + suffix] = pre_rnn[d]
        elif d == 0:
            kwargs['inputs' + suffix] = pre_rnn
        init_states[d] = theano.shared(
            numpy.zeros((args.mini_batch_size, state_dim)).astype(floatX),
            name='state0_%d' % d)
        kwargs['states' + suffix] = init_states[d]

    # Apply the RNN to the inputs
    h = rnn.apply(low_memory=True, **kwargs)

    # Now we have:
    # h = [state, state_1, gate_value_1, state_2, gate_value_2, state_3, ...]

    # Extract gate_values
    gate_values = h[2::2]
    new_h = [h[0]]
    new_h.extend(h[1::2])
    h = new_h

    # Now we have:
    # h = [state, state_1, state_2, ...]
    # gate_values = [gate_value_1, gate_value_2, gate_value_3]

    for i, gate_value in enumerate(gate_values):
        gate_value.name = "gate_value_" + str(i)

    # Save all the last states
    last_states = {}
    for d in range(layers):
        last_states[d] = h[d][-1, :, :]

    # Concatenate all the states. With a single layer the list holds one
    # tensor, which is used directly: a Python list can neither be given a
    # ``name`` attribute nor be sliced as ``h[context:, :, :]`` below.
    if layers > 1:
        h = tensor.concatenate(h, axis=2)
    else:
        h = h[0]
    h.name = "hidden_state"

    # The updates of the hidden states
    updates = [(init_states[d], last_states[d]) for d in range(layers)]

    presoft = output_layer.apply(h[context:, :, :])
    # Define the cost
    # Compute the probability distribution
    n_steps, batch, feat = presoft.shape
    presoft.name = 'presoft'

    cross_entropy = Softmax().categorical_cross_entropy(
        y[context:, :].flatten(),
        presoft.reshape((batch * n_steps, feat)))
    # Presumably converts the cross-entropy from nats to bits.
    cross_entropy = cross_entropy / tensor.log(2)
    cross_entropy.name = "cross_entropy"

    # TODO: add regularisation for the cost
    # the log(1) is here in order to differentiate the two variables
    # for monitoring
    cost = cross_entropy + tensor.log(1)
    cost.name = "regularized_cost"

    # Initialize the model
    logger.info('Initializing...')

    fork.initialize()

    rnn.weights_init = initialization.Orthogonal()
    rnn.biases_init = initialization.Constant(0)
    rnn.initialize()

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    return cost, cross_entropy, updates, gate_values
Exemplo n.º 53
0
class BidiRNN(Initializable):
    """Bidirectional LSTM over a GPS trajectory, followed by an MLP.

    The latitude/longitude sequence is run through a
    ``SegregatedBidirectional`` LSTM. One slice of the recurrent output at
    the first step and one at an index derived from the mask are
    concatenated with the context embeddings and passed to an MLP.
    Subclasses turn the MLP output into a prediction by overriding
    ``process_outputs``.
    """

    @lazy()
    def __init__(self, config, output_dim=2, **kwargs):
        super(BidiRNN, self).__init__(**kwargs)
        self.config = config

        self.context_embedder = ContextEmbedder(config)

        # The recurrent activation is optional in the configuration.
        if hasattr(config, 'rec_activation'):
            activation = config.rec_activation()
        else:
            activation = None
        lstm = LSTM(dim=config.hidden_state_dim,
                    activation=activation,
                    name='recurrent')
        self.rec = SegregatedBidirectional(lstm)

        # Each direction gets its own fork producing every sequence input
        # of the LSTM except the mask.
        fwd_names = [seq for seq in self.rec.prototype.apply.sequences
                     if seq != 'mask']
        self.fwd_fork = Fork(fwd_names, prototype=Linear(), name='fwd_fork')
        bkwd_names = [seq for seq in self.rec.prototype.apply.sequences
                      if seq != 'mask']
        self.bkwd_fork = Fork(bkwd_names, prototype=Linear(),
                              name='bkwd_fork')

        # MLP input: two hidden-state slices plus all context embeddings.
        embedding_width = sum(spec[2] for spec in config.dim_embeddings)
        rto_in = config.hidden_state_dim * 2 + embedding_width
        hidden_activations = [Rectifier() for _ in config.dim_hidden]
        self.rec_to_output = MLP(
            activations=hidden_activations + [Identity()],
            dims=[rto_in] + config.dim_hidden + [output_dim])

        self.sequences = ['latitude', 'latitude_mask', 'longitude']
        self.inputs = self.sequences + self.context_embedder.inputs

        self.children = [
            self.context_embedder,
            self.fwd_fork,
            self.bkwd_fork,
            self.rec,
            self.rec_to_output,
        ]

    def _push_allocation_config(self):
        """Set the fork dimensions from the matching recurrent child."""
        forks = (self.fwd_fork, self.bkwd_fork)
        for direction, fork in enumerate(forks):
            # The fork input is the (latitude, longitude) pair.
            fork.input_dim = 2
            recurrent = self.rec.children[direction]
            fork.output_dims = [recurrent.get_dim(name)
                                for name in fork.output_names]

    def _push_initialization_config(self):
        """Copy the configured initialisation schemes to the sub-bricks."""
        targets = (self.fwd_fork, self.bkwd_fork, self.rec,
                   self.rec_to_output)
        for target in targets:
            target.weights_init = self.config.weights_init
            target.biases_init = self.config.biases_init

    def process_outputs(self, outputs):
        """Hook for subclasses; the base implementation returns ``None``."""
        pass  # must be implemented in child class

    @application(outputs=['destination'])
    def predict(self, latitude, longitude, latitude_mask, **kwargs):
        """Return ``process_outputs`` applied to the MLP output."""
        gps_mean = data.train_gps_mean
        gps_std = data.train_gps_std

        # Transpose the inputs and standardise the coordinates.
        latitude = (latitude.T - gps_mean[0]) / gps_std[0]
        longitude = (longitude.T - gps_mean[1]) / gps_std[1]
        latitude_mask = latitude_mask.T

        rec_in = tensor.concatenate(
            (latitude[:, :, None], longitude[:, :, None]), axis=2)

        last_id = tensor.cast(latitude_mask.sum(axis=0) - 1, dtype='int64')

        fwd_inputs = merge(self.fwd_fork.apply(rec_in, as_dict=True),
                           {'mask': latitude_mask})
        bkwd_inputs = merge(self.bkwd_fork.apply(rec_in, as_dict=True),
                            {'mask': latitude_mask})
        path = self.rec.apply(fwd_inputs, bkwd_inputs)[0]

        hidden_dim = self.config.hidden_state_dim
        # Last `hidden_dim` features at the first step (presumably the
        # backward direction's summary of the sequence).
        at_start = path[0][:, -hidden_dim:]
        # First `hidden_dim` features at a mask-derived index per example.
        # NOTE(review): last_id already subtracts 1 from the mask sum and
        # is decremented again here -- confirm the offset is intended.
        at_end = path[last_id - 1,
                      tensor.arange(latitude_mask.shape[1])][:, :hidden_dim]
        path_representation = (at_start, at_end)

        context_inputs = {key: kwargs[key]
                          for key in self.context_embedder.inputs}
        embeddings = tuple(self.context_embedder.apply(**context_inputs))

        inputs = tensor.concatenate(path_representation + embeddings, axis=1)
        outputs = self.rec_to_output.apply(inputs)

        return self.process_outputs(outputs)

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @application(outputs=['cost'])
    def cost(self, **kwargs):
        """Return the mean ``error.erdist`` between prediction and target."""
        y_hat = self.predict(**kwargs)
        target_latitude = kwargs['destination_latitude'][:, None]
        target_longitude = kwargs['destination_longitude'][:, None]
        y = tensor.concatenate((target_latitude, target_longitude), axis=1)

        return error.erdist(y_hat, y).mean()

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']