Example #1
def make_bidir_lstm_stack(seq, seq_dim, mask, sizes, skip=True, name=''):
    bricks = []

    curr_dim = [seq_dim]
    curr_hidden = [seq]

    hidden_list = []
    for k, dim in enumerate(sizes):
        fwd_lstm_ins = [Linear(input_dim=d, output_dim=4*dim, name='%s_fwd_lstm_in_%d_%d'%(name,k,l)) for l, d in enumerate(curr_dim)]
        fwd_lstm = LSTM(dim=dim, activation=Tanh(), name='%s_fwd_lstm_%d'%(name,k))

        bwd_lstm_ins = [Linear(input_dim=d, output_dim=4*dim, name='%s_bwd_lstm_in_%d_%d'%(name,k,l)) for l, d in enumerate(curr_dim)]
        bwd_lstm = LSTM(dim=dim, activation=Tanh(), name='%s_bwd_lstm_%d'%(name,k))

        bricks = bricks + [fwd_lstm, bwd_lstm] + fwd_lstm_ins + bwd_lstm_ins

        fwd_tmp = sum(x.apply(v) for x, v in zip(fwd_lstm_ins, curr_hidden))
        bwd_tmp = sum(x.apply(v) for x, v in zip(bwd_lstm_ins, curr_hidden))
        fwd_hidden, _ = fwd_lstm.apply(fwd_tmp, mask=mask)
        bwd_hidden, _ = bwd_lstm.apply(bwd_tmp[::-1], mask=mask[::-1])
        hidden_list = hidden_list + [fwd_hidden, bwd_hidden]
        if skip:
            curr_hidden = [seq, fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [seq_dim, dim, dim]
        else:
            curr_hidden = [fwd_hidden, bwd_hidden[::-1]]
            curr_dim = [dim, dim]

    return bricks, hidden_list
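For orientation, here is a minimal usage sketch of the helper above. It assumes the Theano/Blocks imports used throughout these examples (`tensor` as theano.tensor, `IsotropicGaussian`, `Constant`); the variable names and dimensions are illustrative only.

# Hedged usage sketch: seq is a (time, batch, features) tensor, seq_mask a (time, batch) mask.
seq = tensor.tensor3('seq')
seq_mask = tensor.matrix('seq_mask')

bricks, hidden_list = make_bidir_lstm_stack(seq, seq_dim=50, mask=seq_mask,
                                            sizes=[100, 100], name='enc')
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

# hidden_list holds (forward, backward) states per layer; the backward states
# are in reversed time order, so flip them before concatenating features.
top_fwd, top_bwd = hidden_list[-2], hidden_list[-1]
encoded = tensor.concatenate([top_fwd, top_bwd[::-1]], axis=2)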
Example #3
def lstm_layer(in_size, dim, x, h, n, task, first_layer=False):
    if connect_h_to_h == 'all-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + [hidden for hidden in h], axis=2)
            linear = Linear(input_dim=in_size + dim * (n),
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
        else:
            lstm_input = T.concatenate([hidden for hidden in h], axis=2)
            linear = Linear(input_dim=dim * (n + 1),
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
    elif connect_h_to_h == 'two-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=(in_size + dim * 2 if n > 1
                                       else in_size + dim),
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
        else:
            lstm_input = T.concatenate(h[max(0, n - 2):n], axis=2)
            linear = Linear(input_dim=dim * 2 if n > 1 else dim,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
    elif connect_h_to_h == 'one-previous':
        if first_layer:
            lstm_input = x
            linear = Linear(input_dim=in_size,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
        elif connect_x_to_h:
            lstm_input = T.concatenate([x] + [h[n - 1]], axis=2)
            linear = Linear(input_dim=in_size + dim,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
        else:
            lstm_input = h[n - 1]
            linear = Linear(input_dim=dim,
                            output_dim=dim * 4,
                            name='linear' + str(n) + '-' + str(task))
    lstm = LSTM(dim=dim, name=layer_models[n] + str(n) + '-' + str(task))
    initialize([linear, lstm])
    if layer_models[n] == 'lstm':
        return lstm.apply(linear.apply(lstm_input))
    elif layer_models[n] == 'mt_lstm':
        return lstm.apply(linear.apply(lstm_input),
                          time_scale=layer_resolutions[n],
                          time_offset=layer_execution_time_offset[n])
Example #4
def lstm_layer(dim, h, n, x_mask, first, **kwargs):
    linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n))
    lstm = LSTM(dim=dim, activation=Rectifier(), name='lstm' + str(n))
    initialize([linear, lstm])
    applyLin = linear.apply(h)

    if first:
        lstmApply = lstm.apply(applyLin, mask=x_mask, **kwargs)[0]
    else:
        lstmApply = lstm.apply(applyLin, **kwargs)[0]
    return lstmApply
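A small hedged sketch of how a stack could be wired with the helper above, masking only the first layer as the `first` flag suggests. It assumes the surrounding script defines `dim`, a (time, batch, dim) input `h0`, a mask `x_mask`, and the `initialize` helper that lstm_layer relies on; all names here are illustrative.

# Hypothetical two-layer stack built from lstm_layer.
h1 = lstm_layer(dim, h0, n=1, x_mask=x_mask, first=True)
h2 = lstm_layer(dim, h1, n=2, x_mask=x_mask, first=False)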
Example #5
class questionEncoder:
    def __init__(self, word_dim, hidden_dim):
        self.forward_lstm= LSTM(hidden_dim,
                                name='question_forward_lstm',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0))
        self.backward_lstm= LSTM(hidden_dim,
                                 name='question_backward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.x_to_h_forward = Linear(word_dim,
                                     hidden_dim * 4,
                                     name='word_x_to_h_forward',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(word_dim,
                                      hidden_dim * 4,
                                      name='word_x_to_h_backward',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))

        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # variable question length
    # words: batch_size x q x word_dim
    # words_reverse: the reversed sentence of words,
    #                padded with 0 to the max length q
    # mask: batch_size vector holding the index of the last word of each question
    def apply(self, words, words_reverse, mask_, batch_size):
        mask = mask_.flatten()
        # batch_size x q x (4 * hidden_dim)
        Wx = self.x_to_h_forward.apply(words)
        Wx_r = self.x_to_h_backward.apply(words_reverse)
        # q x batch_size x (4 * hidden_dim)
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # q x batch_size x hidden_dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        for i in range(batch_size):
            # set_subtensor is not in-place; keep the returned variable
            hb = T.set_subtensor(hb[0:mask[i]+1, i, :],
                                 hb[0:mask[i]+1, i, :][::-1])

        # q x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb], axis=2)
        # batch_size x hidden_dim
        y_q = hf[mask, range(batch_size), :]
        y_1 = hb[0, range(batch_size), :]
        
        return h.swapaxes(0, 1), y_q, y_1
Example #6
def example4():
    """LSTM -> Plante lors de l'initialisation du lstm."""

    x = tensor.tensor3('x')
    dim = 3

    #    gate_inputs = theano.function([x],x*4)
    gate_inputs = Linear(input_dim=dim,
                         output_dim=dim * 4,
                         name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))

    lstm = LSTM(dim=dim,
                activation=Tanh(),
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))

    gate_inputs.initialize()
    hg = gate_inputs.apply(x)

    #print(gate_inputs.parameters)
    #print(gate_inputs.parameters[1].get_value())

    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX)))

    print("Good Job!")

    #    lstm_output =

    #Initial State
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(
        inputs=x, states=h0,
        cells=c)  # lstm.apply(states=h0,cells=cells,inputs=gate_inputs)

    f = theano.function([x, h0, c], h)
    print("a")
    print(
        f(np.ones((3, 1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX),
          np.ones((1, 3), dtype=theano.config.floatX)))
Example #7
    def __init__(self, input_size, hidden_size, output_size):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size

        x = tensor.tensor3('x', dtype=floatX)
        y = tensor.tensor3('y', dtype=floatX)

        x_to_lstm = Linear(name="x_to_lstm", input_dim=input_size, output_dim=4 * hidden_size,
                           weights_init=IsotropicGaussian(), biases_init=Constant(0))
        lstm = LSTM(dim=hidden_size, name="lstm", weights_init=IsotropicGaussian(), biases_init=Constant(0))
        lstm_to_output = Linear(name="lstm_to_output", input_dim=hidden_size, output_dim=output_size,
                                weights_init=IsotropicGaussian(), biases_init=Constant(0))

        x_transform = x_to_lstm.apply(x)
        h, c = lstm.apply(x_transform)

        y_hat = lstm_to_output.apply(h)
        y_hat = Logistic(name="y_hat").apply(y_hat)

        self.cost = BinaryCrossEntropy(name="cost").apply(y, y_hat)

        x_to_lstm.initialize()
        lstm.initialize()
        lstm_to_output.initialize()

        self.computation_graph = ComputationGraph(self.cost)
Example #8
    def create_model(self):
        input_dim = self.input_dim
        x = self.x
        x_to_h = Linear(input_dim,
                        input_dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))
        lstm = LSTM(input_dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
        h_to_o = Linear(input_dim,
                        1,
                        name='h_to_o',
                        weights_init=IsotropicGaussian(),
                        biases_init=Constant(0.0))

        x_transform = x_to_h.apply(x)
        self.x_to_h = x_to_h
        self.lstm = lstm
        self.h_to_o = h_to_o

        h, c = lstm.apply(x_transform)

        # only values of hidden units of the last timeframe are used for
        # the classification
        probs = h_to_o.apply(h[-1])
        return probs
Example #9
    def apply(self, input_, target):
        x_to_h = Linear(name='x_to_h',
                        input_dim=self.dims[0],
                        output_dim=self.dims[1] * 4)
        pre_rnn = x_to_h.apply(input_)
        pre_rnn.name = 'pre_rnn'
        rnn = LSTM(activation=Tanh(), dim=self.dims[1], name=self.name)
        h, _ = rnn.apply(pre_rnn)
        h.name = 'h'
        h_to_y = Linear(name='h_to_y',
                        input_dim=self.dims[1],
                        output_dim=self.dims[2])
        y_hat = h_to_y.apply(h)
        y_hat.name = 'y_hat'

        cost = SquaredError().apply(target, y_hat)
        cost.name = 'MSE'

        self.outputs = {}
        self.outputs['y_hat'] = y_hat
        self.outputs['cost'] = cost
        self.outputs['pre_rnn'] = pre_rnn
        self.outputs['h'] = h

        # Initialization
        for brick in (rnn, x_to_h, h_to_y):
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0)
            brick.initialize()
Example #10
class CoreNetwork(BaseRecurrent, Initializable):
    def __init__(self, input_dim, dim, **kwargs):
        super(CoreNetwork, self).__init__(**kwargs)
        self.input_dim = input_dim
        self.dim = dim
        self.lstm = LSTM(dim=dim, name=self.name + '_lstm',
                         weights_init=self.weights_init,
                         biases_init=self.biases_init)

        self.proj = Linear(input_dim=input_dim, output_dim=dim*4,
                           name=self.name + '_proj',
                           weights_init=self.weights_init,
                           biases_init=self.biases_init)
        self.children = [self.lstm, self.proj]

    def get_dim(self, name):
        if name == 'inputs':
            return self.input_dim
        elif name in ['state', 'cell']:
            return self.dim
        else:
            raise ValueError

    @recurrent(sequences=['inputs'], states=['state', 'cell'], contexts=[],
               outputs=['state', 'cell'])
    def apply(self, inputs, state, cell):
        state, cell = self.lstm.apply(self.proj.apply(inputs), state, cell,
                                      iterate=False)
        return state, cell
Example #12
def lstm_layer(in_dim, h, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim,
                    output_dim=h_dim * 4,
                    name='linear' + str(n) + pref)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + pref)
    initialize([linear, lstm])
    return lstm.apply(linear.apply(h))[0]
Example #13
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(generate_data(max_seq_length, batch_size,
                                                  num_batches))
    dataset_test = IterableDataset(generate_data(max_seq_length, batch_size,
                                                 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see
    # LSTM layer documentation for the explanation
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test, prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    print 'Learned weights:'
    for layer in (x_to_h, lstm, h_to_o):
        print "Layer '%s':" % layer.name
        for param in layer.parameters:
            print param.name, ': ', param.get_value()
        print
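For context, a hypothetical invocation of the training script above; the argument values are illustrative, and the `generate_data` helper it relies on is assumed to be defined elsewhere in the same file.

# Hypothetical call: train on toy sequences of length up to 50.
main(max_seq_length=50, lstm_dim=10, batch_size=20, num_batches=100, num_epochs=5)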
Example #14
class LinearLSTM(Initializable):
    def __init__(self,
                 input_dim,
                 output_dim,
                 lstm_dim,
                 print_intermediate=False,
                 print_attrs=['__str__'],
                 **kwargs):
        super(LinearLSTM, self).__init__(**kwargs)

        self.x_to_h = Linear(input_dim,
                             lstm_dim * 4,
                             name='x_to_h',
                             weights_init=IsotropicGaussian(),
                             biases_init=Constant(0.0))
        self.lstm = LSTM(lstm_dim,
                         name='lstm',
                         weights_init=IsotropicGaussian(),
                         biases_init=Constant(0.0))
        self.h_to_o = Linear(lstm_dim,
                             output_dim,
                             name='h_to_o',
                             weights_init=IsotropicGaussian(),
                             biases_init=Constant(0.0))

        self.children = [self.x_to_h, self.lstm, self.h_to_o]

        self.print_intermediate = print_intermediate
        self.print_attrs = print_attrs

    @application
    def apply(self, source):

        # reorder from (batch, time, features) to the (time, batch, features)
        # layout expected by the LSTM
        x_linear = self.x_to_h.apply(source.dimshuffle(1, 0, 2))
        x_linear.name = 'x_linear'
        if self.print_intermediate:
            x_linear = Print(message='x_linear info',
                             attrs=self.print_attrs)(x_linear)

        h, c = self.lstm.apply(x_linear)
        if self.print_intermediate:
            h = Print(message="hidden states info", attrs=self.print_attrs)(h)

        y_hat = self.h_to_o.apply(h)
        y_hat.name = 'y_hat'
        if self.print_intermediate:
            y_hat = Print(message="y_hat info", attrs=self.print_attrs)(y_hat)

        return y_hat

    def initialize(self):
        for child in self.children:
            child.initialize()

    def reset_allocation(self):
        for child in self.children:
            child.allocated = False
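A minimal construction sketch for the brick above, assuming a (batch, time, features) input tensor and illustrative dimensions; `tensor` is theano.tensor as in the other examples.

# Hedged usage sketch for LinearLSTM.
features = tensor.tensor3('features')
model = LinearLSTM(input_dim=1, output_dim=1, lstm_dim=10, name='linear_lstm')
model.initialize()
y_hat = model.apply(features)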
Example #15
class seqDecoder:
    def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
        self.W = Linear(input_dim=feature_dim,
                        output_dim=memory_dim * 4,
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0),
                        use_bias=False,
                        name='seqDecoder_W')
        self.GRU_A = LSTM(feature_dim,
                          name='seqDecoder_A',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.GRU_B = LSTM(memory_dim,
                          name='seqDecoder_B',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.W.initialize()
        self.GRU_A.initialize()
        self.GRU_B.initialize()
        self.fc1 = Linear(input_dim=memory_dim,
                          output_dim=fc1_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc1')
        self.fc2 = Linear(input_dim=fc1_dim,
                          output_dim=fc2_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc2')

        self.fc1.initialize()
        self.fc2.initialize()

    # A: the encoding of GRU_A,
    # B: the encoding of GRU_B
    # padding: the tensor constant
    def apply(self, output_length, A, B, padding):
        A_, garbage = self.GRU_A.apply(padding, states=A)
        WA_ = self.W.apply(A_)
        # output_length x batch_size x output_dim
        B_, garbage = self.GRU_B.apply(WA_, states=B)
        # batch_size x output_length x output_dim
        B_ =  B_.swapaxes(0,1)
        fc1_r = relu(self.fc1.apply(B_))
        fc2_r = relu(self.fc2.apply(fc1_r))
        return fc2_r
Example #16
def construct_model(activation_function, r_dim, hidden_dim, out_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]
    nj = x.shape[1]  # also is r.shape[0]
    nr = r.shape[1]

    # r is nj x nr
    # x is nx x nj
    # y is nx

    # Get a representation of r of size r_dim
    r = DAE(r)

    # r is now nj x r_dim

    # r_rep is nx x nj x r_dim
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)
    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (r_dim + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)

    # Change concat from Batch x Time x Features to T X B x F
    rnn_input = concat.dimshuffle(1, 0, 2)

    linear = Linear(input_dim=r_dim + 1,
                    output_dim=4 * hidden_dim,
                    name="input_linear")
    lstm = LSTM(dim=hidden_dim,
                activation=activation_function,
                name="hidden_recurrent")
    top_linear = Linear(input_dim=hidden_dim,
                        output_dim=out_dim,
                        name="out_linear")

    pre_rnn = linear.apply(rnn_input)
    states = lstm.apply(pre_rnn)[0]
    activations = top_linear.apply(states)
    activations = tensor.mean(activations, axis=0)

    cost = Softmax().categorical_cross_entropy(y, activations)

    pred = activations.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters

    for brick in (linear, lstm, top_linear):
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    return cost, error_rate
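A hypothetical call to the constructor above; the values are illustrative, and the `DAE` function referenced in the body is assumed to be defined in the same module.

cost, error_rate = construct_model(Tanh(), r_dim=64, hidden_dim=128, out_dim=10)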
Example #17
def create_rnn(hidden_dim, vocab_dim,mode="rnn"):
    # input
    x = tensor.imatrix('inchar')
    y = tensor.imatrix('outchar')

    # 
    W = LookupTable(
        name = "W1",
        #dim = hidden_dim*4,
        dim = hidden_dim,
        length = vocab_dim,
        weights_init = initialization.IsotropicGaussian(0.01),
        biases_init = initialization.Constant(0)
    )
    if mode == "lstm":
        # Long Short Term Memory
        H = LSTM(
            hidden_dim, 
            name = 'H',
            weights_init = initialization.IsotropicGaussian(0.01),
            biases_init = initialization.Constant(0.0)
        )
    else:
        # recurrent history weight
        H = SimpleRecurrent(
            name = "H",
            dim = hidden_dim,
            activation = Tanh(),
            weights_init = initialization.IsotropicGaussian(0.01)
        )
    # 
    S = Linear(
        name = "W2",
        input_dim = hidden_dim,
        output_dim = vocab_dim,
        weights_init = initialization.IsotropicGaussian(0.01),
        biases_init = initialization.Constant(0)
    )

    A = NDimensionalSoftmax(
        name = "softmax"
    )

    initLayers([W,H,S])
    activations = W.apply(x)
    hiddens = H.apply(activations)#[0]
    activations2 = S.apply(hiddens)
    y_hat = A.apply(activations2, extra_ndim=1)
    cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean()

    cg = ComputationGraph(cost)
    #print VariableFilter(roles=[WEIGHT])(cg.variables)
    #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables)

    layers = (x, W, H, S, A, y)

    return  cg, layers, y_hat, cost
Example #18
def add_lstm(input_dim, input_var):
    linear = Linear(input_dim=input_dim,output_dim=input_dim*4,name="linear_layer")
    lstm = LSTM(dim=input_dim, name="lstm_layer")

    testing_init(linear)
    #linear.initialize()
    default_init(lstm)

    h = linear.apply(input_var)
    return lstm.apply(h)
Example #19
    def create_model(self):
        input_dim = self.input_dim
        x = self.x
        y = self.y
        p = self.p
        mask = self.mask
        hidden_dim = self.hidden_dim
        embedding_dim = self.embedding_dim
        lookup = LookupTable(self.dict_size,
                             embedding_dim,
                             weights_init=IsotropicGaussian(0.001),
                             name='LookupTable')
        x_to_h = Linear(embedding_dim,
                        hidden_dim * 4,
                        name='x_to_h',
                        weights_init=IsotropicGaussian(0.001),
                        biases_init=Constant(0.0))
        lstm = LSTM(hidden_dim,
                    name='lstm',
                    weights_init=IsotropicGaussian(0.001),
                    biases_init=Constant(0.0))
        h_to_o = MLP([Logistic()], [hidden_dim, 1],
                     weights_init=IsotropicGaussian(0.001),
                     biases_init=Constant(0),
                     name='h_to_o')

        lookup.initialize()
        x_to_h.initialize()
        lstm.initialize()
        h_to_o.initialize()

        embed = lookup.apply(x).reshape(
            (x.shape[0], x.shape[1], self.embedding_dim))
        embed.name = "embed_vec"
        x_transform = x_to_h.apply(embed.transpose(1, 0, 2))
        x_transform.name = "Transformed X"
        self.lookup = lookup
        self.x_to_h = x_to_h
        self.lstm = lstm
        self.h_to_o = h_to_o

        #if mask is None:
        h, c = lstm.apply(x_transform)
        #else:
        #h, c = lstm.apply(x_transform, mask=mask)
        h.name = "hidden_state"
        c.name = "cell state"
        # only values of hidden units of the last timeframe are used for
        # the classification
        indices = T.sum(mask, axis=0) - 1
        rel_hid = h[indices, T.arange(h.shape[1])]
        out = self.h_to_o.apply(rel_hid)

        probs = out
        return probs
Example #22
    def lstm_layer(self, h, n):
        """
        Performs the LSTM update for a batch of word sequences
        :param h The word embeddings for this update
        :param n The index of this LSTM layer
        """
        # Maps the word embedding to a dimensionality to be used in the LSTM
        linear = Linear(input_dim=self.hidden_size, output_dim=self.hidden_size * 4, name='linear_lstm' + str(n))
        initialize(linear, sqrt(6.0 / (5 * self.hidden_size)))
        lstm = LSTM(dim=self.hidden_size, name='lstm' + str(n))
        initialize(lstm, 0.08)
        return lstm.apply(linear.apply(h))
Example #24
    def __init__(self, input1_size, input2_size, lookup1_dim=200, lookup2_dim=200, hidden_size=512):
        self.hidden_size = hidden_size
        self.input1_size = input1_size
        self.input2_size = input2_size
        self.lookup1_dim = lookup1_dim
        self.lookup2_dim = lookup2_dim

        x1 = tensor.lmatrix('durations')
        x2 = tensor.lmatrix('syllables')
        y = tensor.lmatrix('pitches')

        lookup1 = LookupTable(dim=self.lookup1_dim, length=self.input1_size, name='lookup1',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup1.initialize()
        lookup2 = LookupTable(dim=self.lookup2_dim, length=self.input2_size, name='lookup2',
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        lookup2.initialize()
        merge = Merge(['lookup1', 'lookup2'], [self.lookup1_dim, self.lookup2_dim], self.hidden_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        merge.initialize()
        recurrent_block = LSTM(dim=self.hidden_size, activation=Tanh(),
                              weights_init=initialization.Uniform(width=0.01)) #RecurrentStack([LSTM(dim=self.hidden_size, activation=Tanh())] * 3)
        recurrent_block.initialize()
        linear = Linear(input_dim=self.hidden_size, output_dim=self.input1_size,
                              weights_init=initialization.Uniform(width=0.01),
                              biases_init=Constant(0))
        linear.initialize()
        softmax = NDimensionalSoftmax()

        l1 = lookup1.apply(x1)
        l2 = lookup2.apply(x2)
        m = merge.apply(l1, l2)
        h = recurrent_block.apply(m)
        a = linear.apply(h)

        y_hat = softmax.apply(a, extra_ndim=1)
        # ValueError: x must be 1-d or 2-d tensor of floats. Got TensorType(float64, 3D)

        self.Cost = softmax.categorical_cross_entropy(y, a, extra_ndim=1).mean()

        self.ComputationGraph = ComputationGraph(self.Cost)

        self.Model = Model(y_hat)
Example #25
class Encoder(Initializable):

    def __init__(self, image_feature_dim, embedding_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        self.image_embedding = Linear(
              input_dim=image_feature_dim
            , output_dim=embedding_dim
            # , weights_init=IsotropicGaussian(0.02)
            # , biases_init=Constant(0.)
            , name="image_embedding"
            )

        self.to_inputs = Linear(
              input_dim=embedding_dim
            , output_dim=embedding_dim*4 # gate_inputs = vstack(input, forget, cell, output) gate pre-activations
            # , weights_init=IsotropicGaussian(0.02)
            # , biases_init=Constant(0.)
            , name="to_inputs"
            )

        # Note: the LSTM dim need not equal embedding_dim; the choice here is arbitrary
        self.transition = LSTM(
            dim=embedding_dim, name="transition")

        self.children = [ self.image_embedding
                        , self.to_inputs
                        , self.transition
                        ]

    @application(inputs=['image_vects', 'word_vects'], outputs=['image_embedding', 'sentence_embedding'])   
    def apply(self, image_vects, word_vects):

        image_embedding = self.image_embedding.apply(image_vects)

        # inputs = word_vects
        inputs = self.to_inputs.apply(word_vects)
        inputs = inputs.dimshuffle(1, 0, 2)
        hidden, cells = self.transition.apply(inputs=inputs, mask=None)

        # the last hidden state represents the accumulation of all the words (i.e. the sentence)
        # grab all batches, grab the last value representing accumulation of the sequence, grab all features
        sentence_embedding = hidden[-1]
        # sentence_embedding = inputs.mean(axis=0)
        return image_embedding, sentence_embedding
Example #26
class Encoder(Initializable):

    def __init__(self, image_feature_dim, embedding_dim, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        self.image_embedding = Linear(
              input_dim=image_feature_dim
            , output_dim=embedding_dim
            , name="image_embedding"
            )

        self.to_inputs = Linear(
              input_dim=embedding_dim
            , output_dim=embedding_dim*4 # times 4 because the gate inputs stack (input, forget, cell, output)
            , name="to_inputs"
            )

        self.transition = LSTM(
            dim=embedding_dim, name="transition")

        self.children = [ self.image_embedding
                        , self.to_inputs
                        , self.transition
                        ]

    @application(
          inputs=['image_vects', 'word_vects']
        , outputs=['image_embedding', 'sentence_embedding']
        )   
    def apply(self, image_vects, word_vects):

        image_embedding = self.image_embedding.apply(image_vects)

        inputs = self.to_inputs.apply(word_vects)
        
        # shuffle dimensions to correspond to (sequence, batch, features)
        inputs = inputs.dimshuffle(1, 0, 2)
        
        hidden, cells = self.transition.apply(inputs=inputs, mask=None)

        # last hidden state represents the accumulation of word embeddings 
        # (i.e. the sentence embedding)
        sentence_embedding = hidden[-1]

        return image_embedding, sentence_embedding
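A minimal construction sketch for the Encoder brick above. The initialization scheme mirrors the commented-out IsotropicGaussian(0.02)/Constant(0.) values in Example #25; the dimensions and variable names are illustrative.

# Hedged usage sketch: image_vects is (batch, image_feature_dim),
# word_vects is (batch, time, embedding_dim).
image_vects = tensor.matrix('image_vects')
word_vects = tensor.tensor3('word_vects')
encoder = Encoder(image_feature_dim=4096, embedding_dim=300, name='encoder',
                  weights_init=IsotropicGaussian(0.02), biases_init=Constant(0.))
encoder.initialize()
image_emb, sentence_emb = encoder.apply(image_vects, word_vects)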
Example #28
    def build_theano_functions(self) :
        #import pdb ; pdb.set_trace()
        x = T.fmatrix('x')
        s = T.fvector('s')
        mu = T.fvector('mu')
        mu = T.reshape(mu,(self.number_of_mix,1))
        pi = T.fvector('pi')

        lstm = LSTM(
            dim=self.input_dim/4,
            weights_init=IsotropicGaussian(0.5),
            biases_init=Constant(1))
        lstm.initialize()
        h, c = lstm.apply(x)
        h = h[0][0][-1]

        LL = T.sum(pi*(1./(T.sqrt(2.*np.pi)*s))*T.exp(\
            -0.5*(h-mu)**2/T.reshape(s,(self.number_of_mix,1))**2.).sum(axis=1))
        cost = -T.log(LL)

        #cg = ComputationGraph(cost)
        #self.cg = cg
        #parameters = cg.parameters
        model = Model(cost)
        self.model = model
        parameters = model.parameters

        grads = T.grad(cost, parameters)
        updates = []
        for i in range(len(grads)) :
            updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))

        gradf = theano.function([x,s,mu,pi],[cost],updates=updates)
        f = theano.function([x],[h])

        return gradf, f
Example #30
class LSTMReadDefinitions(Initializable):
    """
    Converts definition into embeddings.

    Parameters
    ----------
    num_input_words: int, default: -1
        If positive, dynamically restricts the vocabulary to the first
        `num_input_words` word ids (the parameter name is a bit confusing).
        WARNING: it assumes word ids are monotonic with frequency!

    emb_dim : int
        Dimensionality of word embeddings

    dim : int
        Dimensionality of the def rnn.

    lookup: None or LookupTable

    fork_and_rnn: None or tuple (Linear, RNN)
    """
    def __init__(self,
                 num_input_words,
                 emb_dim,
                 dim,
                 vocab,
                 lookup=None,
                 fork_and_rnn=None,
                 **kwargs):

        if num_input_words > 0:
            logger.info("Restricting def vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        self._vocab = vocab

        children = []

        if lookup is None:
            self._def_lookup = LookupTable(self._num_input_words,
                                           emb_dim,
                                           name='def_lookup')
        else:
            self._def_lookup = lookup

        if fork_and_rnn is None:
            self._def_fork = Linear(emb_dim, 4 * dim, name='def_fork')
            self._def_rnn = LSTM(dim, name='def_rnn')
        else:
            self._def_fork, self._def_rnn = fork_and_rnn

        children.extend([self._def_lookup, self._def_fork, self._def_rnn])

        super(LSTMReadDefinitions, self).__init__(children=children, **kwargs)

    @application
    def apply(self, application_call, defs, def_mask):
        """
        Returns vector per each word in sequence using the dictionary based lookup
        """
        # Short listing
        defs = (T.lt(defs, self._num_input_words) * defs +
                T.ge(defs, self._num_input_words) * self._vocab.unk)
        application_call.add_auxiliary_variable(unk_ratio(
            defs, def_mask, self._vocab.unk),
                                                name='def_unk_ratio')

        embedded_def_words = self._def_lookup.apply(defs)
        def_embeddings = self._def_rnn.apply(T.transpose(
            self._def_fork.apply(embedded_def_words), (1, 0, 2)),
                                             mask=def_mask.T)[0][-1]

        return def_embeddings
Example #31
class Rnn(Initializable, BaseRecurrent):
    def __init__(self, dims=(88, 100, 100), **kwargs):
        super(Rnn, self).__init__(**kwargs)
        self.dims = dims

        self.input_transform = Linear(input_dim=dims[0], output_dim=dims[1],
                                      weights_init=IsotropicGaussian(0.01),
                                      # biases_init=Constant(0.0),
                                      use_bias=False,
                                      name="input_transfrom")

        self.gru_layer = SimpleRecurrent(dim=dims[1], activation=Tanh(),
                                         weights_init=IsotropicGaussian(0.01),
                                         biases_init=Constant(0.0),
                                         use_bias=True,
                                         name="gru_rnn_layer")

        # TODO: find a way to automatically set the output dim in case of lstm vs normal rnn
        self.linear_trans = Linear(input_dim=dims[1], output_dim=dims[2] * 4,
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0.0),
                                   use_bias=False,
                                   name="h2h_transform")

        self.lstm_layer = LSTM(dim=dims[2], activation=Tanh(),
                               weights_init=IsotropicGaussian(0.01),
                               biases_init=Constant(0.0),
                               use_bias=True,
                               name="lstm_rnn_layer")

        self.out_transform = MLP(activations=[Sigmoid()], dims=[dims[2], dims[0]],
                                 weights_init=IsotropicGaussian(0.01),
                                 use_bias=True,
                                 biases_init=Constant(0.0),
                                 name="out_layer")

        self.children = [self.input_transform, self.gru_layer, self.linear_trans,
                         self.lstm_layer, self.out_transform]

    # @recurrent(sequences=['inputs', 'input_mask'], contexts=[],
    # states=['gru_state', 'lstm_state', 'lstm_cells'],
    # outputs=['gru_state', 'lstm_state', 'lstm_cells'])
    def rnn_apply(self, inputs, mask=None, gru_state=None, lstm_state=None, lstm_cells=None):
        input_transform = self.input_transform.apply(inputs)
        gru_state = self.gru_layer.apply(
            inputs=input_transform,
            # update_inputs=input_transform,
            # reset_inputs=input_transform,
            states=gru_state,
            mask=mask,
            iterate=False)
        lstm_transform = self.linear_trans.apply(gru_state)
        lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state,
                                                       cells=lstm_cells,
                                                       mask=mask, iterate=False)
        return gru_state, lstm_state, lstm_cells

    @recurrent(sequences=[], contexts=[],
               states=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'],
               outputs=['inputs', 'gru_state', 'lstm_state', 'lstm_cells'])
    def rnn_generate(self, inputs=None, gru_state=None, lstm_state=None, lstm_cells=None):
        output = self.apply(inputs=inputs,
                            gru_state=gru_state,
                            lstm_state=lstm_state,
                            lstm_cells=lstm_cells,
                            iterate=False)
        return output, gru_state, lstm_state, lstm_cells


    @recurrent(sequences=['inputs', 'mask'], contexts=[],
               states=['gru_state', 'lstm_state', 'lstm_cells'],
               outputs=['output', 'gru_state', 'lstm_state', 'lstm_cells'])
    def apply(self, inputs, mask, gru_state=None, lstm_state=None, lstm_cells=None):
        # input_transform = self.input_transform.apply(inputs)
        # gru_state = self.gru_layer.apply(
        # inputs=input_transform,
        #     mask=mask,
        #     states=gru_state,
        #     iterate=False)
        # lstm_transform = self.linear_trans.apply(gru_state)
        # lstm_state, lstm_cells = self.lstm_layer.apply(inputs=lstm_transform, states=lstm_state,
        #                                                cells=lstm_cells,
        #                                                mask=mask, iterate=False)
        gru_state, lstm_state, lstm_cells = self.rnn_apply(inputs=inputs,
                                                           mask=mask,
                                                           gru_state=gru_state,
                                                           lstm_state=lstm_state,
                                                           lstm_cells=lstm_cells)

        output = 1.17 * self.out_transform.apply(lstm_state) * mask[:, None]
        return output, gru_state, lstm_state, lstm_cells


    def get_dim(self, name):
        dims = dict(zip(['outputs', 'gru_state', 'lstm_state'], self.dims))
        dims['lstm_cells'] = dims['lstm_state']
        return dims.get(name, None) or super(Rnn, self).get_dim(name)
Example #32
                  k = k,
                  const = 0.00001)

bricks = [mlp_x, transition, mlp_gmm]

for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
    brick.biases_init = Constant(0.)
    brick.initialize()

##############
# Test model
##############

x_g = mlp_x.apply(x)
h = transition.apply(x_g)
mu, sigma, coeff = mlp_gmm.apply(h[-2])

#from theano import function
#x_tr, x_mask_tr, y_tr = next(data_stream.get_epoch_iterator())
#print function([x], x_g)(x_tr).shape
#print function([x], h)(x_tr)[-2].shape
#print function([x], mu)(x_tr).shape

from play.utils import GMM
cost = GMM(y, mu, sigma, coeff)
cost = cost*x_mask
cost = cost.sum()/x_mask.sum()
cost.name = 'sequence_log_likelihood'

cg = ComputationGraph(cost)
Example #33
def rf_lstm_experiment(data_name, exp_network, in_dim, out_dim, num_layers,
                       start_neurons, num_neurons, batch_size, num_epochs):
    """LSTM Experiment."""
    # load dataset
    train_set = IterableDataset(
        ds.transform_sequence(data_name, "train", batch_size))
    test_set = IterableDataset(
        ds.transform_sequence(data_name, "test", batch_size))
    stream_train = DataStream(dataset=train_set)
    stream_test = DataStream(dataset=test_set)
    methods = ['sgd', 'momentum', 'adagrad', 'rmsprop']

    for n_layers in xrange(1, num_layers + 1):
        for n_neurons in xrange(start_neurons, num_neurons + 5, 5):
            for method in methods:
                X = T.tensor3("features")
                y = T.matrix("targets")

                x_to_h = Linear(in_dim,
                                n_neurons * 4,
                                name='x_to_h',
                                weights_init=IsotropicGaussian(),
                                biases_init=Constant(0.0))
                lstm = LSTM(n_neurons,
                            name='lstm',
                            weights_init=IsotropicGaussian(),
                            biases_init=Constant(0.0))

                h_to_o = nc.setup_ff_network(n_neurons, out_dim, n_layers - 1,
                                             n_neurons)

                X_trans = x_to_h.apply(X)
                h, c = lstm.apply(X_trans)
                y_hat = h_to_o.apply(h[-1])
                cost, cg = nc.create_cg_and_cost(y, y_hat, "none")

                lstm.initialize()
                x_to_h.initialize()
                h_to_o.initialize()

                algorithm = nc.setup_algorithms(cost, cg, method, type="RNN")

                test_monitor = DataStreamMonitoring(variables=[cost],
                                                    data_stream=stream_test,
                                                    prefix="test")
                train_monitor = TrainingDataMonitoring(variables=[cost],
                                                       prefix="train",
                                                       after_epoch=True)

                main_loop = MainLoop(
                    algorithm=algorithm,
                    data_stream=stream_train,
                    extensions=[
                        test_monitor, train_monitor,
                        FinishAfter(after_n_epochs=num_epochs),
                        Printing(),
                        ProgressBar()
                    ])

                main_loop.run()

                # Saving results
                exp_id = ds.create_exp_id(exp_network, n_layers, n_neurons,
                                          batch_size, num_epochs, method,
                                          "none")

                # prepare related functions
                predict = theano.function([X], y_hat)

                # prepare related data
                train_features, train_targets = ds.get_iter_data(train_set)
                test_features, test_targets = ds.get_iter_data(test_set)

                # Prediction of result
                train_predicted = gen_prediction(predict, train_features)
                test_predicted = gen_prediction(predict, test_features)

                # Get cost
                cost = ds.get_cost_data(test_monitor, train_set.num_examples,
                                        num_epochs)

                # logging
                ds.save_experiment(train_targets, train_predicted,
                                   test_targets, test_predicted, cost,
                                   exp_network, n_layers, n_neurons,
                                   batch_size, num_epochs, method, "none",
                                   exp_id, "../results/")
Example #34
class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str

    """
    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim,
                                  name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations,
                                  readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in text and in the definitions,
            # we can be screwed.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True, translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # create default input variables
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self,
                application_call,
                text,
                mask,
                def_embs=None,
                def_map=None,
                text_name=None):
        if not self._random_unk:
            text = (tensor.lt(text, self._num_input_words) * text +
                    tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs +
                    tensor.ge(text, self._num_input_words)[:, :, None] *
                    disconnected_grad(embs))
        if def_embs is not None:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        encoded = flip01(
            self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)),
                                    mask=mask.T)[0])
        return encoded

    @application
    def apply(self,
              application_call,
              contexts,
              contexts_mask,
              questions,
              questions_mask,
              answer_begins,
              answer_ends,
              defs=None,
              def_mask=None,
              contexts_def_map=None,
              questions_def_map=None):
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # should be (batch size, context length, question_length)
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
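        # Padded positions get a large negative score here so the softmax below
        # assigns them (numerically) zero attention weight.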
        # soft-aligns every position in the context to positions in the question
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                                name='d2q_att_weights')
        # soft-aligns every position in the question to positions in the document
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                                name='q2d_att_weights')

        # question encoding "in the view of the document"
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # document encoding "in the view of the question"
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # note: forward and backward LSTMs share the
        # input weights in the current impl
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                              mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (1 -
                                                                contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins) *
                       tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                                name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                                name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                                name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        return self.apply(*self.input_vars.values())
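
The pattern `scores * mask - 1000.0 * (1 - mask)` used above pushes padded positions to a large negative value before a softmax, so they end up with (numerically) zero probability. Below is a minimal, self-contained Theano sketch of that trick; it is an illustration only, not part of the original example, and the names and sizes are made up.

import numpy
import theano
from theano import tensor

scores = tensor.matrix('scores')   # (batch, positions)
mask = tensor.matrix('mask')       # 1.0 for real positions, 0.0 for padding
masked_scores = scores * mask - 1000.0 * (1 - mask)
probs = tensor.nnet.softmax(masked_scores)
f = theano.function([scores, mask], probs)

s = numpy.array([[1.0, 2.0, 3.0]], dtype=theano.config.floatX)
m = numpy.array([[1.0, 1.0, 0.0]], dtype=theano.config.floatX)
print(f(s, m))  # the padded third position gets ~0 probability
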
Example #35
#       RECURRENT LAYERS
rec_mask = conv_out_mask.dimshuffle(1, 0)
rec_in = conv_out[:, :, :, 0].dimshuffle(2, 0, 1)
rec_in_dim = conv_out_channels

rb = []
for i, p in enumerate(recs):
    # RNN bricks
    if p["type"] == "lstm":
        pre_rec = Linear(input_dim=rec_in_dim, output_dim=4 * p["dim"], name="rnn_linear%d" % i)
        rec = LSTM(activation=Tanh(), dim=p["dim"], name="rnn%d" % i)
        rb = rb + [pre_rec, rec]

        rnn_in = pre_rec.apply(rec_in)

        rec_out, _ = rec.apply(inputs=rnn_in, mask=rec_mask)
        dropout_b = [rec]
        rec_out_dim = p["dim"]
    elif p["type"] == "simple":
        pre_rec = Linear(input_dim=rec_in_dim, output_dim=p["dim"], name="rnn_linear%d" % i)
        rec = SimpleRecurrent(activation=Tanh(), dim=p["dim"], name="rnn%d" % i)
        rb = rb + [pre_rec, rec]

        rnn_in = pre_rec.apply(rec_in)

        rec_out = rec.apply(inputs=rnn_in, mask=rec_mask)
        dropout_b = [rec]
        rec_out_dim = p["dim"]
    elif p["type"] == "blstm":
        pre_frec = Linear(input_dim=rec_in_dim, output_dim=4 * p["dim"], name="frnn_linear%d" % i)
        pre_brec = Linear(input_dim=rec_in_dim, output_dim=4 * p["dim"], name="brnn_linear%d" % i)
class LanguageModel(Initializable):
    """
    This takes the word embeddings from the LSTMCompositionalLayer and creates sentence embeddings using an LSTM.

    compositional_layer_type can be:
        1) 'BidirectionalLSTMCompositionalLayer'
        2) 'UnidirectionalLSTMCompositionalLayer'
        3) 'BaselineLSTMCompositionalLayer'

    Input is a 3d tensor of subword ids with shape (num_words, num_subwords, batch_size) and
    a 3d mask tensor of the same shape (num_words, num_subwords, batch_size).

    All hidden state sizes are the same as the subword embedding size

    This returns a 3d tensor with dimensions of
    (num_words = num RNN states, batch_size, sentence embedding size = LM_RNN_hidden_state_size = subword_RNN_hidden_state_size * 2)
    """

    def __init__(self, batch_size, num_subwords, num_words, subword_embedding_size, input_vocab_size,
                 subword_RNN_hidden_state_size, LM_RNN_hidden_state_size, table_width=0.08,
                 compositional_layer_type='BidirectionalLSTMCompositionalLayer', init_type='xavier', **kwargs):

        super(LanguageModel, self).__init__(**kwargs)
        self.batch_size = batch_size
        self.num_subwords = num_subwords # number of subwords which make up a word
        self.num_words = num_words  # number of words in the sentence
        self.subword_embedding_size = subword_embedding_size
        self.input_vocab_size = input_vocab_size
        self.subword_RNN_hidden_state_size = subword_RNN_hidden_state_size  #i.e. word embedding size
        self.LM_RNN_hidden_state_size = LM_RNN_hidden_state_size #i.e sentence embedding size
        self.table_width = table_width

        self.name = 'Language_Model'

        if init_type == 'xavier':
            linear_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size, self.LM_RNN_hidden_state_size)
            lstm_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size, self.LM_RNN_hidden_state_size)
        else:  # default is gaussian
            linear_init = IsotropicGaussian()
            lstm_init = IsotropicGaussian()


        self.compositional_layer = None
        self.linear = None
        if compositional_layer_type == 'BidirectionalLSTMCompositionalLayer':
            self.compositional_layer =  BidirectionalLSTMCompositionalLayer(self.batch_size, self.num_subwords, self.num_words,
                                                          self.subword_embedding_size, self.input_vocab_size,
                                                          self.subword_RNN_hidden_state_size, self.table_width,
                                                          init_type=init_type, name='compositional_layer')

            if init_type == 'xavier':
                linear_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size * 2, self.LM_RNN_hidden_state_size)
                lstm_init = XavierInitializationOriginal(self.subword_RNN_hidden_state_size * 2, self.LM_RNN_hidden_state_size)
            else:  # default is gaussian
                linear_init = IsotropicGaussian()
                lstm_init = IsotropicGaussian()

            self.linear = Linear(input_dim=self.subword_RNN_hidden_state_size * 2, #  2 * for the bidirectional
                                     output_dim=self.LM_RNN_hidden_state_size * 4,
                                     name='linear', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))

        elif compositional_layer_type == 'UnidirectionalLSTMCompositionalLayer':
            self.compositional_layer =  LSTMCompositionalLayer(self.batch_size, self.num_subwords, self.num_words,
                                                          self.subword_embedding_size, self.input_vocab_size,
                                                          self.subword_RNN_hidden_state_size, self.table_width,
                                                          init_type=init_type, name='compositional_layer')

            self.linear = Linear(input_dim=self.subword_RNN_hidden_state_size,
                                     output_dim=self.LM_RNN_hidden_state_size * 4,
                                     name='linear', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))

        elif compositional_layer_type == 'BaselineLSTMCompositionalLayer':
            self.compositional_layer =  BaselineLSTMCompositionalLayer(self.batch_size, self.num_subwords, self.num_words,
                                                          self.subword_embedding_size, self.input_vocab_size,
                                                          self.subword_RNN_hidden_state_size, self.table_width,
                                                          init_type=init_type, name='compositional_layer')

            self.linear = Linear(input_dim=self.subword_RNN_hidden_state_size,
                                     output_dim=self.LM_RNN_hidden_state_size * 4,
                                     name='linear', weights_init=IsotropicGaussian(), biases_init=Constant(0.0))

        else:
            print('ERROR: compositional_layer_type = ' + compositional_layer_type + ' is invalid')
            sys.exit()

        # has one RNN which reads the word embeddings into a sentence embedding, or partial sentence embeddings
        self.language_model_RNN = LSTM(
            dim=self.LM_RNN_hidden_state_size, activation=Identity(), name='language_model_RNN',
            weights_init=IsotropicGaussian(), biases_init=Constant(0.0))

        self.children = [self.compositional_layer, self.linear, self.language_model_RNN]


    @application(inputs=['subword_id_input_', 'subword_id_input_mask_'], outputs=['sentence_embeddings', 'word_embeddings_mask'])
    def apply(self, subword_id_input_, subword_id_input_mask_):
        """
        subword_id_input_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size).
        It is expected as a dtype=uint16 or equivalent

        subword_id_input_mask_ is a 3d tensor with the dimensions of shape = (num_words, num_subwords, batch_size).
        It is expected as a dtype=uint8 or equivalent and has binary values of 1 when there is data and zero otherwise.

        Returned is a 3d tensor of size (num_words = num RNN states, batch_size, sentence embedding size)
        Also returned is a 1d tensor of size (batch_size) describing whether the sentence is valid or empty in the batch.
        """
        word_embeddings, word_embeddings_mask = self.compositional_layer.apply(subword_id_input_, subword_id_input_mask_)
        sentence_embeddings = self.language_model_RNN.apply(
            self.linear.apply(word_embeddings), mask=word_embeddings_mask)[0] #[0] = hidden states, [1] = cells

        # sentence_embeddings_mask = word_embeddings_mask.max(axis=0).T

        return sentence_embeddings, word_embeddings_mask
Example #37
def construct_model(input_dim, out_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]
    nj = x.shape[1]  # also is r.shape[0]
    nr = r.shape[1]

    # r is nj x nr
    # x is nx x nj
    # y is nx

    # r_rep is nx x nj x nr
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)
    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (nr + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)

    # Change concat from Batch x Time x Features to T X B x F
    mlp_input = concat.dimshuffle(1, 0, 2)

    if use_ensembling:
        # Split time dimension into batches of size num_feats
        # Join that dimension with the B dimension
        ens_shape = (num_feats,
                     mlp_input.shape[0]/num_feats,
                     mlp_input.shape[1])
        mlp_input = mlp_input.reshape(ens_shape + (input_dim+1,))
        mlp_input = mlp_input.reshape((ens_shape[0], ens_shape[1] * ens_shape[2], input_dim+1))
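        # mlp_input is now (num_feats, (T // num_feats) * B, input_dim + 1)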

    mlp = MLP(dims=[input_dim+1] + mlp_hidden_dims,
              activations=[activation_function for _ in mlp_hidden_dims],
              name='mlp')

    lstm_bot_linear = Linear(input_dim=mlp_hidden_dims[-1], output_dim=4 * lstm_hidden_dim,
                    name="lstm_input_linear")
    lstm = LSTM(dim=lstm_hidden_dim, activation=activation_function,
                name="hidden_recurrent")
    lstm_top_linear = Linear(input_dim=lstm_hidden_dim, output_dim=out_dim,
                        name="out_linear")

    rnn_input = mlp.apply(mlp_input)

    pre_rnn = lstm_bot_linear.apply(rnn_input)
    states = lstm.apply(pre_rnn)[0]
    activations = lstm_top_linear.apply(states)

    if use_ensembling:
        activations = activations.reshape(ens_shape + (out_dim,))
        # Unsplit batches (ensembling)
        activations = tensor.mean(activations, axis=1)

    # Mean over time
    activations = tensor.mean(activations, axis=0)

    cost = Softmax().categorical_cross_entropy(y, activations)

    pred = activations.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in (mlp, lstm_bot_linear, lstm, lstm_top_linear):
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # apply noise
    cg = ComputationGraph([cost, error_rate])
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    apply_noise(cg, noise_vars, noise_std)
    apply_dropout(cg, [rnn_input], dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    return cost_reg, error_rate_reg, cost, error_rate
Example #38
def lstm_layer(in_dim, h, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4, name='linear' + str(n) + pref)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + pref)
    initialize([linear, lstm])
    return lstm.apply(linear.apply(h))[0]
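
A minimal, self-contained sketch of what the helper above wires together (dimensions, names and the initialization scheme are illustrative assumptions; the original relies on an `initialize` helper defined elsewhere):

import numpy
import theano
from theano import tensor
from blocks.bricks import Linear
from blocks.bricks.recurrent import LSTM
from blocks.initialization import IsotropicGaussian, Constant

in_dim, h_dim = 8, 16
h = tensor.tensor3('h')  # (time, batch, in_dim)
linear = Linear(input_dim=in_dim, output_dim=h_dim * 4, name='linear0',
                weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.))
lstm = LSTM(dim=h_dim, name='lstm0',
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.))
linear.initialize()
lstm.initialize()
states = lstm.apply(linear.apply(h))[0]  # hidden states, shape (time, batch, h_dim)

f = theano.function([h], states)
print(f(numpy.zeros((5, 2, in_dim), dtype=theano.config.floatX)).shape)  # (5, 2, 16)
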
Example #39
iteration = 100  # number of epochs of gradient descent
lr = 0.2  # learning rate

print "Building Model"
# Symbolic variables
x = tensor.tensor3('x', dtype=floatX)
target = tensor.tensor3('target', dtype=floatX)

# Build the model
linear = Linear(input_dim=n_u, output_dim=4 * n_h, name="first_layer")
lstm = LSTM(dim=n_h, activation=Tanh())
linear2 = Linear(input_dim=n_h, output_dim=n_y, name="output_layer")
sigm = Sigmoid()

x_transform = linear.apply(x)
h = lstm.apply(x_transform)[0]
predict = sigm.apply(linear2.apply(h))


# only for generation B x h_dim
h_initial = tensor.tensor3('h_initial', dtype=floatX)
h_testing = lstm.apply(x_transform, states=h_initial, iterate=False)[0]
y_hat_testing = linear2.apply(h_testing)
y_hat_testing = sigm.apply(y_hat_testing)
y_hat_testing.name = 'y_hat_testing'


# Cost function
cost = SquaredError().apply(predict, target)

# Initialization
Example #40
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        context = tensor.imatrix('context')
        context_mask = tensor.imatrix('context_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        ans_indices = tensor.imatrix('ans_indices')  # n_steps * n_samples
        ans_indices_mask = tensor.imatrix('ans_indices_mask')

        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)
        context = context.dimshuffle(1, 0)
        context_mask = context_mask.dimshuffle(1, 0)
        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)
        ans_indices = ans_indices.dimshuffle(1, 0)
        ans_indices_mask = ans_indices_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='embed')
        embed.weights_init = IsotropicGaussian(0.01)
        # embed.weights_init = Constant(init_embedding_table(filename='embeddings/vocab_embeddings.txt'))

        # one directional LSTM encoding
        q_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='q_lstm_in')
        q_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='q_lstm')
        c_lstm_ins = Linear(input_dim=config.embed_size,
                            output_dim=4 * config.pre_lstm_size,
                            name='c_lstm_in')
        c_lstm = LSTM(dim=config.pre_lstm_size,
                      activation=Tanh(),
                      name='c_lstm')
        bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins]

        q_tmp = q_lstm_ins.apply(embed.apply(question))
        c_tmp = c_lstm_ins.apply(embed.apply(context))
        q_hidden, _ = q_lstm.apply(q_tmp,
                                   mask=question_mask.astype(
                                       theano.config.floatX))  # lq, bs, dim
        c_hidden, _ = c_lstm.apply(c_tmp,
                                   mask=context_mask.astype(
                                       theano.config.floatX))  # lc, bs, dim

        # Attention mechanism Bilinear question
        attention_question = Linear(input_dim=config.pre_lstm_size,
                                    output_dim=config.pre_lstm_size,
                                    name='att_question')
        bricks += [attention_question]
        att_weights_question = q_hidden[
            None, :, :, :] * attention_question.apply(
                c_hidden.reshape(
                    (c_hidden.shape[0] * c_hidden.shape[1],
                     c_hidden.shape[2]))).reshape(
                         (c_hidden.shape[0], c_hidden.shape[1],
                          c_hidden.shape[2]))[:,
                                              None, :, :]  # --> lc,lq,bs,dim
        att_weights_question = att_weights_question.sum(
            axis=3)  # sum over axis 3 -> dimensions --> lc,lq,bs
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,bs,lq
        att_weights_question = att_weights_question.reshape(
            (att_weights_question.shape[0] * att_weights_question.shape[1],
             att_weights_question.shape[2]))  # --> lc*bs,lq
        att_weights_question = tensor.nnet.softmax(
            att_weights_question
        )  # softmax over axis 1 -> length of question # --> lc*bs,lq
        att_weights_question = att_weights_question.reshape(
            (c_hidden.shape[0], q_hidden.shape[1],
             q_hidden.shape[0]))  # --> lc,bs,lq
        att_weights_question = att_weights_question.dimshuffle(
            0, 2, 1)  # --> lc,lq,bs

        question_context_attention = att_weights_question.dimshuffle(2, 1, 0)
        question_context_attention.name = "question_context_attention"

        self.analyse_vars = [question_context_attention]
        attended_question = tensor.sum(
            q_hidden[None, :, :, :] * att_weights_question[:, :, :, None],
            axis=1)  # sum over axis 1 -> length of question --> lc,bs,dim
        attended_question.name = 'attended_question'
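        # attended_question[i, b, :] is a question summary for context position i
        # of sample b: a weighted average of the question hidden states under the
        # attention weights computed above.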

        # Match LSTM
        cqembed = tensor.concatenate([c_hidden, attended_question], axis=2)
        mlstms, mhidden_list = make_bidir_lstm_stack(
            cqembed, 2 * config.pre_lstm_size,
            context_mask.astype(theano.config.floatX), config.match_lstm_size,
            config.match_skip_connections, 'match')
        bricks = bricks + mlstms
        if config.match_skip_connections:
            menc_dim = 2 * sum(config.match_lstm_size)
            menc = tensor.concatenate(mhidden_list, axis=2)
        else:
            menc_dim = 2 * config.match_lstm_size[-1]
            menc = tensor.concatenate(mhidden_list[-2:], axis=2)
        menc.name = 'menc'

        #pointer networks decoder LSTM and Attention parameters
        params = init_params(data_dim=config.decoder_data_dim,
                             lstm_dim=config.decoder_lstm_output_dim)
        tparams = init_tparams(params)

        self.theano_params = []
        add_role(tparams['lstm_de_W'], WEIGHT)
        add_role(tparams['lstm_de_U'], WEIGHT)
        add_role(tparams['lstm_de_b'], BIAS)
        add_role(tparams['ptr_b1'], BIAS)
        add_role(tparams['ptr_b2'], BIAS)
        add_role(tparams['ptr_v'], WEIGHT)
        add_role(tparams['ptr_W1'], WEIGHT)
        add_role(tparams['ptr_W2'], WEIGHT)
        self.theano_params = tparams.values()

        #n_steps = length , n_samples = batch_size
        n_steps = ans_indices.shape[0]
        n_samples = ans_indices.shape[1]
        preds, generations = ptr_network(
            tparams, cqembed, context_mask.astype(theano.config.floatX),
            ans_indices, ans_indices_mask.astype(theano.config.floatX),
            config.decoder_lstm_output_dim, menc)

        self.generations = generations

        idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'),
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]
        # probs *= y_mask
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        # probs += (1 - y_mask)  # change unmasked position to 1, since log(1) = 0
        probs += off
        # probs_printed = theano.printing.Print('this is probs')(probs)
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()
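        # In words: take the probability assigned to the gold answer index at each
        # decoding step, add a small epsilon for stability, negate the log, mask
        # out padded steps, and average over valid steps and then over the batch.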

        # Apply dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, mhidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        # self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        # self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #41
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # we need to provide data for the LSTM layer of size 4 * lstm_dim, see the
    # LSTM layer documentation for the explanation
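    # (that 4 * lstm_dim input is split internally into four equal chunks, one
    # per LSTM gate: input, forget, cell candidate and output)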
    x_to_h = Linear(1,
                    lstm_dim * 4,
                    name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim,
                name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim,
                    1,
                    name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)
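    # h and c have shape (sequence length, batch size, lstm_dim); h[-1] below is
    # the hidden state at the last time step.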

    # only values of hidden units of the last timeframe are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost],
                                           prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm,
                         stream_train,
                         extensions=[
                             test_monitor, train_monitor,
                             FinishAfter(after_n_epochs=num_epochs),
                             Printing(),
                             ProgressBar()
                         ])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()

    return main_loop
Example #42
def main():
    x = T.tensor3('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')

    #rnn = SimpleRecurrent(
            #dim = 50,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)

    #rnn = GatedRecurrent(
            #dim = 50,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)

    embedding_size = 300
    #glove_version = "vectors.6B.100d.txt"
    glove_version = "glove.6B.300d.txt"
    #fork = Fork(weights_init=IsotropicGaussian(0.02),
            #biases_init=Constant(0.),
            #input_dim=embedding_size,
            #output_dims=[embedding_size]*3,
            #output_names=['inputs', 'reset_inputs', 'update_inputs']
            #)

    rnn = LSTM(
            dim = embedding_size,
            activation=Tanh(),
            weights_init = IsotropicGaussian(std=0.02),
        )
    rnn.initialize()

    #fork.initialize()
    wstd = 0.02

    score_layer = Linear(
            input_dim = 128,
            output_dim = 1,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.),
            name="linear2")
    score_layer.initialize()

    gloveMapping = Linear(
            input_dim = embedding_size,
            output_dim = embedding_size,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(0.0),
            name="gloveMapping"
            )
    gloveMapping.initialize()
    o = gloveMapping.apply(x)
    o = Rectifier(name="rectivfyglove").apply(o)

    forget_bias = np.zeros((embedding_size*4), dtype=theano.config.floatX)
    forget_bias[embedding_size:embedding_size*2] = 4.0
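    # The slice [embedding_size:2*embedding_size] is assumed to be the forget-gate
    # part of the LSTM pre-activation, so this starts training with the forget
    # gates biased towards remembering.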
    toLSTM = Linear(
            input_dim = embedding_size,
            output_dim = embedding_size*4,
            weights_init = IsotropicGaussian(std=wstd),
            biases_init = Constant(forget_bias),
            #biases_init = Constant(0.0),
            name="ToLSTM"
            )
    toLSTM.initialize()


    rnn_states, rnn_cells = rnn.apply(toLSTM.apply(o) * T.shape_padright(m), mask=m)
    #inputs, reset_inputs, update_inputs = fork.apply(x)
    #rnn_states = rnn.apply(inputs=inputs, reset_inputs=reset_inputs, update_inputs=update_inputs, mask=m)

    #rnn_out = rnn_states[:, -1, :]
    rnn_out = (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1) / m.sum(axis=1).dimshuffle(0, 'x')
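    # Masked mean over time: zero out padded steps, sum over the time axis and
    # divide by the number of valid steps in each example.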
    #rnn_out = (rnn_states).mean(axis=1)# / m.sum(axis=1)

    hidden = Linear(
        input_dim = embedding_size,
        output_dim = 128,
        weights_init = Uniform(std=0.01),
        biases_init = Constant(0.))
    hidden.initialize()

    o = hidden.apply(rnn_out)
    o = Rectifier().apply(o)
    hidden = Linear(
        input_dim = 128,
        output_dim = 128,
        weights_init = IsotropicGaussian(std=0.02),
        biases_init = Constant(0.),
        name="hiddenmap2")
    hidden.initialize()

    o = hidden.apply(o)
    o = Rectifier(name="rec2").apply(o)

    o = score_layer.apply(o)

    probs = Sigmoid().apply(o)

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    #print (rnn_states * m.dimshuffle(0, 1, 'x')).sum(axis=1).shape.eval(
            #{x : np.ones((45, 111, embedding_size), dtype=theano.config.floatX),
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).sum(axis=1).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #print (m).shape.eval({
                #m : np.ones((45, 111), dtype=theano.config.floatX)})
    #raw_input()


    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
            cost = cost,
            params=params,
            step_rule = CompositeRule([
                StepClipping(threshold=10),
                AdaM(),
                #AdaDelta(),
                ])

            )


    # ========
    print "setting up data"

    train_dataset = IMDBText('train')
    test_dataset = IMDBText('test')
    batch_size = 16
    n_train = train_dataset.num_examples
    train_stream = DataStream(
            dataset=train_dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size)
            )
    glove = GloveTransformer(glove_version, data_stream=train_stream)

    train_padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
            #mask_sources=[]
            )


    test_stream = DataStream(
            dataset=test_dataset,
            iteration_scheme=ShuffledScheme(
                examples=test_dataset.num_examples,
                batch_size=batch_size)
            )
    glove = GloveTransformer(glove_version, data_stream=test_stream)
    test_padded = Padding(
            data_stream=glove,
            mask_sources=('features',)
            #mask_sources=[]
            )
    print "setting up model"
    #import ipdb
    #ipdb.set_trace()

    lstm_norm = rnn.W_state.norm(2)
    lstm_norm.name = "lstm_norm"

    pre_norm= gloveMapping.W.norm(2)
    pre_norm.name = "pre_norm"

    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification, lstm_norm, pre_norm],
        prefix='train',
        after_epoch=True
        ))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded,
        prefix='test',
        after_epoch=True
        ))
    extensions.append(Timing())
    extensions.append(Printing())

    extensions.append(Plot("norms", channels=[['train_lstm_norm', 'train_pre_norm']], after_epoch=True))
    extensions.append(Plot("result", channels=[['train_cost', 'train_misclassification']], after_epoch=True))

    main_loop = MainLoop(
            model=model,
            data_stream=train_padded,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
Example #43
    def build_theano_functions(self):
        x = T.fmatrix('time_sequence')
        x = x.reshape((self.batch_dim, self.sequence_dim, self.time_dim))

        y = x[:,1:self.sequence_dim,:]
        x = x[:,:self.sequence_dim-1,:]

        # if we try to include the spectrogram features
        spec_dims = 0
        if self.image_size is not None :
            print "Convolution activated"
            self.init_conv()
            spec = T.ftensor4('spectrogram')
            spec_features, spec_dims = self.conv.build_conv_layers(spec)
            print "Conv final dims =", spec_dims
            spec_dims = np.prod(spec_dims)
            spec_features = spec_features.reshape(
                (self.batch_dim, self.sequence_dim-1, spec_dims))
            x = T.concatenate([x, spec_features], axis=2)

        layers_input = [x]
        dims = np.array([self.time_dim + spec_dims])
        for dim in self.lstm_layers_dim :
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)) :

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(dims[layer],
                            dims[layer+1]*4,
                            weights_init=Orthogonal(self.orth_scale),
                            biases_init=Constant(0),
                            name="linear"+str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X sequence X time
            lstm = LSTM(
                dim=dims[layer+1],
                weights_init=IsotropicGaussian(mean=0.,std=0.5),
                biases_init=Constant(1),
                name="lstm"+str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale*Orthogonal().generate(np.random, lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(dims[1:].sum(),
                                  self.output_dim,
                                  weights_init=Orthogonal(self.orth_scale),
                                  use_bias=False,
                                  name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1 :
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else :
            y_hat = output_transform.apply(T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        pis = T.reshape(
                    T.nnet.softmax(
                        T.reshape(y_hat[:,:,:self.gmm_dim], ((self.sequence_dim-1)*self.batch_dim, self.gmm_dim))),
                    (self.batch_dim, (self.sequence_dim-1), self.gmm_dim))
        sig = T.exp(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+1e-6
        mus = y_hat[:,:,self.gmm_dim*2:]

        pis = pis[:,:,:,np.newaxis]
        mus = mus[:,:,:,np.newaxis]
        sig = sig[:,:,:,np.newaxis]
        y = y[:,:,np.newaxis,:]

        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
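        # Per (batch, step) the target likelihood is
        #   log p(y) = log sum_k pi_k * prod_t N(y_t | mu_k, sig_k)
        # which is evaluated below in log space with a log-sum-exp over the
        # mixture axis (subtracting the per-step maximum for stability).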
        expo_term = -0.5*((y-mus)**2)/sig**2
        coeff = T.log(T.maximum(1./(T.sqrt(2.*np.pi)*sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3, keepdims=True)

        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))).mean()
        LL.name = "summed_likelihood"

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        algorithm = GradientDescent(
            cost=LL,
            parameters=model.parameters,
            step_rule=Adam())

        f = theano.function([x],[pis, sig, mus])

        return algorithm, f
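
For reference, a small numpy-only sketch of the same stabilized mixture log-likelihood (shapes and values are illustrative, not taken from the original):

import numpy as np

EPS = 1e-6
batch, steps, mixtures, time_dim = 2, 3, 4, 1
y = np.random.randn(batch, steps, 1, time_dim)
mus = np.random.randn(batch, steps, mixtures, time_dim)
sig = np.abs(np.random.randn(batch, steps, mixtures, time_dim)) + 1e-6
pis = np.full((batch, steps, mixtures, 1), 1.0 / mixtures)

expo_term = -0.5 * ((y - mus) ** 2) / sig ** 2
coeff = np.log(np.maximum(1.0 / (np.sqrt(2.0 * np.pi) * sig), EPS))
log_seq = np.log(pis + EPS) + (coeff + expo_term).sum(axis=3, keepdims=True)
log_seq_max = log_seq.max(axis=2, keepdims=True)
LL = -(log_seq_max + np.log(EPS + np.exp(log_seq - log_seq_max).sum(axis=2, keepdims=True))).mean()
print(LL)  # negative mean log-likelihood, as in the Theano graph above
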
Example #44
    def __init__(self, config, vocab_size):
        question = tensor.imatrix('question')
        question_mask = tensor.imatrix('question_mask')
        answer = tensor.imatrix('answer')
        answer_mask = tensor.imatrix('answer_mask')
        better = tensor.imatrix('better')
        better_mask = tensor.imatrix('better_mask')
        worse = tensor.imatrix('worse')
        worse_mask = tensor.imatrix('worse_mask')
        b_left = tensor.imatrix('b_left')
        b_left_mask = tensor.imatrix('b_left_mask')
        b_right = tensor.imatrix('b_right')
        b_right_mask = tensor.imatrix('b_right_mask')
        w_left = tensor.imatrix('w_left')
        w_left_mask = tensor.imatrix('w_left_mask')
        w_right = tensor.imatrix('w_right')
        w_right_mask = tensor.imatrix('w_right_mask')


        bricks = []

        question = question.dimshuffle(1, 0)
        question_mask = question_mask.dimshuffle(1, 0)

        better = better.dimshuffle(1, 0)
        better_mask = better_mask.dimshuffle(1, 0)

        worse = worse.dimshuffle(1, 0)
        worse_mask = worse_mask.dimshuffle(1, 0)

        b_left = b_left.dimshuffle(1, 0)
        b_left_mask = b_left_mask.dimshuffle(1, 0)

        b_right = b_right.dimshuffle(1, 0)
        b_right_mask = b_right_mask.dimshuffle(1, 0)

        w_left = w_left.dimshuffle(1, 0)
        w_left_mask = w_left_mask.dimshuffle(1, 0)

        w_right = w_right.dimshuffle(1, 0)
        w_right_mask = w_right_mask.dimshuffle(1, 0)

        answer = answer.dimshuffle(1, 0)
        answer_mask = answer_mask.dimshuffle(1, 0)

        # Embed questions and context
        embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
        embed.weights_init = IsotropicGaussian(0.01)

        # Calculate question encoding (concatenate layer1)
        qembed = embed.apply(question)
        qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX),
                                                     config.question_lstm_size, config.question_skip_connections, 'q')


        bricks = bricks + qlstms
        if config.question_skip_connections:
            qenc_dim = 2*sum(config.question_lstm_size)
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1)
        else:
            qenc_dim = 2*config.question_lstm_size[-1]
            qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1)
        qenc.name = 'qenc'

        # candidate encoders
        candidates_hidden_list = []

        candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_fwd_lstm_in_0_0')
        candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_fwd_lstm_0')

        candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_bwd_lstm_in_0_0')
        candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_bwd_lstm_0')

        #adding encoding bricks for initialization
        bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm, candidate_fwd_lstm_ins, candidate_bwd_lstm_ins]

        #computing better encoding
        better_embed = embed.apply(better)
        better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed)
        better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed)
        better_fwd_hidden, _ = candidate_fwd_lstm.apply(better_fwd_tmp, mask=better_mask.astype(theano.config.floatX))
        better_bwd_hidden, _ = candidate_bwd_lstm.apply(better_bwd_tmp[::-1], mask=better_mask.astype(theano.config.floatX)[::-1])
        better_hidden_list = [better_fwd_hidden, better_bwd_hidden]
        better_enc_dim = 2*sum(config.ctx_lstm_size)
        better_enc = tensor.concatenate([h[-1,:,:] for h in better_hidden_list], axis=1)  # concatenate last fwd/bwd states -> (batch_size, 2*dim)
        better_enc.name = 'better_enc'
        candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden, better_bwd_hidden]

        #computing worse encoding
        worse_embed = embed.apply(worse)
        worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed)
        worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed)
        worse_fwd_hidden, _ = candidate_fwd_lstm.apply(worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX))
        worse_bwd_hidden, _ = candidate_bwd_lstm.apply(worse_bwd_tmp[::-1], mask=worse_mask.astype(theano.config.floatX)[::-1])
        worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden]
        worse_enc_dim = 2*sum(config.ctx_lstm_size)
        worse_enc = tensor.concatenate([h[-1,:,:] for h in worse_hidden_list], axis=1)
        worse_enc.name = 'worse_enc'
        candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden, worse_bwd_hidden]


        #left encoders
        left_context_hidden_list = []

        left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_fwd_lstm_in_0_0')
        left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_fwd_lstm_0')

        left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_bwd_lstm_in_0_0')
        left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_bwd_lstm_0')

        #adding encoding bricks for initialization
        bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm, left_context_fwd_lstm_ins, left_context_bwd_lstm_ins]

        #right encoders
        right_context_hidden_list = []

        right_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_fwd_lstm_in_0_0')
        right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_fwd_lstm_0')

        right_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_bwd_lstm_in_0_0')
        right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_bwd_lstm_0')

        #adding encoding bricks for initialization
        bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm, right_context_fwd_lstm_ins, right_context_bwd_lstm_ins]


        #left half encodings
        better_left_embed = embed.apply(b_left)
        better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed)
        better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed)
        better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX))
        better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(better_left_bwd_tmp[::-1], mask=b_left_mask.astype(theano.config.floatX)[::-1])
        better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden]
        better_left_enc_dim = 2*sum(config.ctx_lstm_size)
        better_left_enc = tensor.concatenate([h[-1,:,:] for h in better_left_hidden_list], axis=1)  # concatenate last fwd/bwd states -> (batch_size, 2*dim)
        better_left_enc.name = 'better_left_enc'
        left_context_hidden_list = left_context_hidden_list + [better_left_fwd_hidden, better_left_bwd_hidden]

        worse_left_embed = embed.apply(w_left)
        worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed)
        worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed)
        worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX))
        worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(worse_left_bwd_tmp[::-1], mask=w_left_mask.astype(theano.config.floatX)[::-1])
        worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden]
        worse_left_enc_dim = 2*sum(config.ctx_lstm_size)
        worse_left_enc = tensor.concatenate([h[-1,:,:] for h in worse_left_hidden_list], axis=1)  # concatenate last fwd/bwd states -> (batch_size, 2*dim)
        worse_left_enc.name = 'worse_left_enc'
        left_context_hidden_list = left_context_hidden_list + [worse_left_fwd_hidden, worse_left_bwd_hidden]


        #right half encoding
        better_right_embed = embed.apply(b_right)
        better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed)
        better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed)
        better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX))
        better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(better_right_bwd_tmp[::-1], mask=b_right_mask.astype(theano.config.floatX)[::-1])
        better_right_hidden_list = [better_right_fwd_hidden, better_right_bwd_hidden]
        better_right_enc_dim = 2*sum(config.ctx_lstm_size)
        better_right_enc = tensor.concatenate([h[-1,:,:] for h in better_right_hidden_list], axis=1)  # concatenate last fwd/bwd states -> (batch_size, 2*dim)
        better_right_enc.name = 'better_right_enc'
        right_context_hidden_list = right_context_hidden_list + [better_right_fwd_hidden, better_right_bwd_hidden]

        worse_right_embed = embed.apply(w_right)
        worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed)
        worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed)
        worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX))
        worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(worse_right_bwd_tmp[::-1], mask=w_right_mask.astype(theano.config.floatX)[::-1])
        worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden]
        worse_right_enc_dim = 2*sum(config.ctx_lstm_size)
        worse_right_enc = tensor.concatenate([h[-1,:,:] for h in worse_right_hidden_list], axis=1)  # concatenate last fwd/bwd states -> (batch_size, 2*dim)
        worse_right_enc.name = 'worse_right_enc'
        right_context_hidden_list = right_context_hidden_list + [worse_right_fwd_hidden, worse_right_bwd_hidden]


        # F1 prediction MLP
        prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1],
                             activations=config.prediction_mlp_activations[1:] + [Identity()],
                             name='prediction_mlp')

        prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, name='preq')
        prediction_cand_linear = Linear(input_dim=worse_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='precand')
        prediction_left_half_linear = Linear(input_dim=better_left_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preleft')
        prediction_right_half_linear = Linear(input_dim=better_right_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preright')
        bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear, prediction_left_half_linear, prediction_right_half_linear]
        better_layer1 = Tanh('tan1').apply(tensor.concatenate([prediction_cand_linear.apply(better_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(better_left_enc), prediction_right_half_linear.apply(better_right_enc)],axis=1))
        better_layer1.name = 'better_layer1'

        worse_layer1 = Tanh('tan2').apply(tensor.concatenate([prediction_cand_linear.apply(worse_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(worse_left_enc), prediction_right_half_linear.apply(worse_right_enc)],axis=1))
        worse_layer1.name = 'worse_layer1'



        better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1)) #batch_size
        worse_pred_weights = Tanh('rec2').apply(prediction_mlp.apply(worse_layer1)) #batch_size

        # numpy.set_printoptions(edgeitems=500)
        # better_pred_weights = theano.printing.Print('better')(better_pred_weights)
        # worse_pred_weights = theano.printing.Print('better')(worse_pred_weights)
        # #cost : max(0,- score-better + score-worse + margin)
        margin = config.margin
        conditions = tensor.lt(better_pred_weights, worse_pred_weights + margin).astype(theano.config.floatX)
        self.predictions = conditions
        cost = (-better_pred_weights + worse_pred_weights + margin) * conditions
        cost = cost.mean()
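        # Margin ranking loss: whenever score(better) < score(worse) + margin the
        # pair contributes (margin + score(worse) - score(better)), otherwise 0;
        # `conditions` also serves as the per-pair prediction.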

        # Apply dropout
        cg = ComputationGraph([cost])

        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
Example #45
class Model(Initializable):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.config = config

        self.pre_context_embedder = ContextEmbedder(
            config.pre_embedder, name='pre_context_embedder')
        self.post_context_embedder = ContextEmbedder(
            config.post_embedder, name='post_context_embedder')

        in1 = 2 + sum(x[2] for x in config.pre_embedder.dim_embeddings)
        self.input_to_rec = MLP(activations=[Tanh()],
                                dims=[in1, config.hidden_state_dim],
                                name='input_to_rec')

        self.rec = LSTM(dim=config.hidden_state_dim, name='recurrent')

        in2 = config.hidden_state_dim + sum(
            x[2] for x in config.post_embedder.dim_embeddings)
        self.rec_to_output = MLP(activations=[Tanh()],
                                 dims=[in2, 2],
                                 name='rec_to_output')

        self.sequences = ['latitude', 'latitude_mask', 'longitude']
        self.context = self.pre_context_embedder.inputs + self.post_context_embedder.inputs
        self.inputs = self.sequences + self.context
        self.children = [
            self.pre_context_embedder, self.post_context_embedder,
            self.input_to_rec, self.rec, self.rec_to_output
        ]

        self.initial_state_ = shared_floatx_zeros((config.hidden_state_dim, ),
                                                  name="initial_state")
        self.initial_cells = shared_floatx_zeros((config.hidden_state_dim, ),
                                                 name="initial_cells")

    def _push_initialization_config(self):
        for mlp in [self.input_to_rec, self.rec_to_output]:
            mlp.weights_init = self.config.weights_init
            mlp.biases_init = self.config.biases_init
        self.rec.weights_init = self.config.weights_init

    def get_dim(self, name):
        return self.rec.get_dim(name)

    @application
    def initial_state(self, *args, **kwargs):
        return self.rec.initial_state(*args, **kwargs)

    @recurrent(states=['states', 'cells'],
               outputs=['destination', 'states', 'cells'],
               sequences=['latitude', 'longitude', 'latitude_mask'])
    def predict_all(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = (latitude - data.train_gps_mean[0]) / data.train_gps_std[0]
        longitude = (longitude -
                     data.train_gps_mean[1]) / data.train_gps_std[1]

        pre_emb = tuple(self.pre_context_embedder.apply(**kwargs))
        latitude = tensor.shape_padright(latitude)
        longitude = tensor.shape_padright(longitude)
        itr = self.input_to_rec.apply(
            tensor.concatenate(pre_emb + (latitude, longitude), axis=1))
        itr = itr.repeat(4, axis=1)
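        # the LSTM brick expects an input of width 4 * hidden_state_dim (one
        # block per gate); repeating the dim-sized projection four times along
        # the feature axis produces exactly that.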
        (next_states, next_cells) = self.rec.apply(itr,
                                                   kwargs['states'],
                                                   kwargs['cells'],
                                                   mask=latitude_mask,
                                                   iterate=False)

        post_emb = tuple(self.post_context_embedder.apply(**kwargs))
        rto = self.rec_to_output.apply(
            tensor.concatenate(post_emb + (next_states, ), axis=1))

        rto = (rto * data.train_gps_std) + data.train_gps_mean
        return (rto, next_states, next_cells)

    @predict_all.property('contexts')
    def predict_all_inputs(self):
        return self.context

    @application(outputs=['destination'])
    def predict(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = latitude.T
        longitude = longitude.T
        latitude_mask = latitude_mask.T
        res = self.predict_all(latitude, longitude, latitude_mask, **kwargs)[0]
        return res[-1]

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @application(outputs=['cost_matrix'])
    def cost_matrix(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = latitude.T
        longitude = longitude.T
        latitude_mask = latitude_mask.T

        res = self.predict_all(latitude, longitude, latitude_mask, **kwargs)[0]
        target = tensor.concatenate(
            (kwargs['destination_latitude'].dimshuffle('x', 0, 'x'),
             kwargs['destination_longitude'].dimshuffle('x', 0, 'x')),
            axis=2)
        target = target.repeat(latitude.shape[0], axis=0)
        ce = error.erdist(target.reshape((-1, 2)), res.reshape((-1, 2)))
        ce = ce.reshape(latitude.shape)
        return ce * latitude_mask

    @cost_matrix.property('inputs')
    def cost_matrix_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']

    @application(outputs=['cost'])
    def cost(self, latitude_mask, **kwargs):
        return (self.cost_matrix(latitude_mask=latitude_mask,
                                 **kwargs).sum() / latitude_mask.sum())

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']

    @application(outputs=['cost'])
    def valid_cost(self, **kwargs):
        # Only works when batch_size is 1.
        return self.cost_matrix(**kwargs)[-1, 0]

    @valid_cost.property('inputs')
    def valid_cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']
Example #46
0
class Model(Initializable):
    @lazy()
    def __init__(self, config, **kwargs):
        super(Model, self).__init__(**kwargs)
        self.config = config

        self.pre_context_embedder = ContextEmbedder(config.pre_embedder, name='pre_context_embedder')
        self.post_context_embedder = ContextEmbedder(config.post_embedder, name='post_context_embedder')

        in1 = 2 + sum(x[2] for x in config.pre_embedder.dim_embeddings)
        self.input_to_rec = MLP(activations=[Tanh()], dims=[in1, config.hidden_state_dim], name='input_to_rec')

        self.rec = LSTM(
                dim = config.hidden_state_dim,
                name = 'recurrent'
            )

        in2 = config.hidden_state_dim + sum(x[2] for x in config.post_embedder.dim_embeddings)
        self.rec_to_output = MLP(activations=[Tanh()], dims=[in2, 2], name='rec_to_output')

        self.sequences = ['latitude', 'latitude_mask', 'longitude']
        self.context = self.pre_context_embedder.inputs + self.post_context_embedder.inputs
        self.inputs = self.sequences + self.context
        self.children = [ self.pre_context_embedder, self.post_context_embedder, self.input_to_rec, self.rec, self.rec_to_output ]

        self.initial_state_ = shared_floatx_zeros((config.hidden_state_dim,),
                name="initial_state")
        self.initial_cells = shared_floatx_zeros((config.hidden_state_dim,),
                name="initial_cells")

    def _push_initialization_config(self):
        for mlp in [self.input_to_rec, self.rec_to_output]:
            mlp.weights_init = self.config.weights_init
            mlp.biases_init = self.config.biases_init
        self.rec.weights_init = self.config.weights_init

    def get_dim(self, name):
        return self.rec.get_dim(name)

    @application
    def initial_state(self, *args, **kwargs):
        return self.rec.initial_state(*args, **kwargs)

    @recurrent(states=['states', 'cells'], outputs=['destination', 'states', 'cells'], sequences=['latitude', 'longitude', 'latitude_mask'])
    def predict_all(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = (latitude - data.train_gps_mean[0]) / data.train_gps_std[0]
        longitude = (longitude - data.train_gps_mean[1]) / data.train_gps_std[1]

        pre_emb = tuple(self.pre_context_embedder.apply(**kwargs))
        latitude = tensor.shape_padright(latitude)
        longitude = tensor.shape_padright(longitude)
        itr = self.input_to_rec.apply(tensor.concatenate(pre_emb + (latitude, longitude), axis=1))
        itr = itr.repeat(4, axis=1)
        (next_states, next_cells) = self.rec.apply(itr, kwargs['states'], kwargs['cells'], mask=latitude_mask, iterate=False)

        post_emb = tuple(self.post_context_embedder.apply(**kwargs))
        rto = self.rec_to_output.apply(tensor.concatenate(post_emb + (next_states,), axis=1))

        rto = (rto * data.train_gps_std) + data.train_gps_mean
        return (rto, next_states, next_cells)

    @predict_all.property('contexts')
    def predict_all_inputs(self):
        return self.context

    @application(outputs=['destination'])
    def predict(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = latitude.T
        longitude = longitude.T
        latitude_mask = latitude_mask.T
        res = self.predict_all(latitude, longitude, latitude_mask, **kwargs)[0]
        return res[-1]

    @predict.property('inputs')
    def predict_inputs(self):
        return self.inputs

    @application(outputs=['cost_matrix'])
    def cost_matrix(self, latitude, longitude, latitude_mask, **kwargs):
        latitude = latitude.T
        longitude = longitude.T
        latitude_mask = latitude_mask.T

        res = self.predict_all(latitude, longitude, latitude_mask, **kwargs)[0]
        target = tensor.concatenate(
                    (kwargs['destination_latitude'].dimshuffle('x', 0, 'x'),
                     kwargs['destination_longitude'].dimshuffle('x', 0, 'x')),
                axis=2)
        target = target.repeat(latitude.shape[0], axis=0)
        ce = error.erdist(target.reshape((-1, 2)), res.reshape((-1, 2)))
        ce = ce.reshape(latitude.shape)
        return ce * latitude_mask

    @cost_matrix.property('inputs')
    def cost_matrix_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']

    @application(outputs=['cost'])
    def cost(self, latitude_mask, **kwargs):
        return self.cost_matrix(latitude_mask=latitude_mask, **kwargs).sum() / latitude_mask.sum()

    @cost.property('inputs')
    def cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']

    @application(outputs=['cost'])
    def valid_cost(self, **kwargs):
        # Only works when batch_size is 1.
        return self.cost_matrix(**kwargs)[-1,0]

    @valid_cost.property('inputs')
    def valid_cost_inputs(self):
        return self.inputs + ['destination_latitude', 'destination_longitude']
Example #47
0
def lstm_layer(dim, h, n):
    linear = Linear(input_dim=dim, output_dim=dim * 4, name='linear' + str(n))
    lstm = LSTM(dim=dim, name='lstm' + str(n))
    initialize([linear, lstm])
    return lstm.apply(linear.apply(h))[0]
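# A minimal usage sketch (an assumption, not part of the original source):
# with the same Blocks/Theano imports as the surrounding examples and the
# `initialize` helper above in scope, layers can be stacked by chaining calls
# on a (time, batch, dim) tensor whose last axis matches `dim`:
#
#     x = tensor.tensor3('x')              # hypothetical input, (T, B, 256)
#     h1 = lstm_layer(dim=256, h=x, n=0)
#     h2 = lstm_layer(dim=256, h=h1, n=1)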
Example #48
0
def main():
    x = T.imatrix('features')
    m = T.matrix('features_mask')
    y = T.imatrix('targets')
    #x_int = x.astype(dtype='int32').T
    x_int = x.T

    train_dataset = IMDB('train')
    n_voc = len(train_dataset.dict.keys())
    n_h = 2
    lookup = LookupTable(
            length=n_voc+2,
            dim = n_h*4,
            weights_init = Uniform(std=0.01),
            biases_init = Constant(0.)
        )
    lookup.initialize()

    #rnn = SimpleRecurrent(
            #dim = n_h,
            #activation=Tanh(),
            #weights_init = Uniform(std=0.01),
            #biases_init = Constant(0.)
        #)
    rnn = LSTM(
            dim = n_h,
            activation=Tanh(),
            weights_init = Uniform(std=0.01),
            biases_init = Constant(0.)
        )

    rnn.initialize()
    score_layer = Linear(
            input_dim = n_h,
            output_dim = 1,
            weights_init = Uniform(std=0.01),
            biases_init = Constant(0.))
    score_layer.initialize()

    embedding = lookup.apply(x_int) * T.shape_padright(m.T)
    #embedding = lookup.apply(x_int) + m.T.mean()*0

    rnn_states = rnn.apply(embedding, mask=m.T)
    #rnn_states, rnn_cells = rnn.apply(embedding)
    rnn_out_mean_pooled = rnn_states[-1]
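    # despite the name, this takes the hidden state at the last time step;
    # the mean-over-time variant is the commented-out line below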
    #rnn_out_mean_pooled = rnn_states.mean()

    probs = Sigmoid().apply(
        score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * T.log(probs) + (1-y) * T.log(1 - probs)).mean()
    cost.name = 'cost'
    misclassification = (y * (probs < 0.5) + (1-y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'


    # =================

    cg = ComputationGraph([cost])
    params = cg.parameters
    algorithm = GradientDescent(
            cost=cost,
            params=params,
            step_rule=CompositeRule([
                StepClipping(threshold=10),
                Adam(),
                #AdaDelta(),
                ])
            )


    # ========

    test_dataset = IMDB('test')
    batch_size = 64
    n_train = train_dataset.num_examples
    train_stream = DataStream(
            dataset=train_dataset,
            iteration_scheme=ShuffledScheme(
                examples=n_train,
                batch_size=batch_size)
            )
    train_padded = Padding(
            data_stream=train_stream,
            mask_sources=('features',)
            #mask_sources=[]
            )


    test_stream = DataStream(
            dataset=test_dataset,
            iteration_scheme=ShuffledScheme(
                examples=test_dataset.num_examples,
                batch_size=batch_size)
            )
    test_padded = Padding(
            data_stream=test_stream,
            mask_sources=('features',)
            #mask_sources=[]
            )
    #import ipdb
    #ipdb.set_trace()

    #======
    model = Model(cost)
    extensions = []
    extensions.append(EpochProgress(batch_per_epoch=train_dataset.num_examples // batch_size + 1))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True
        ))

    extensions.append(DataStreamMonitoring(
        [cost, misclassification],
        data_stream=test_padded,
        prefix='test',
        after_epoch=True
        ))
    extensions.append(Timing())
    extensions.append(Printing())

    main_loop = MainLoop(
            model=model,
            data_stream=train_padded,
            algorithm=algorithm,
            extensions=extensions)
    main_loop.run()
Example #49
0
    def build_theano_functions(self):
        x = T.ftensor3('x')  # shape of input : batch X time X value
        y = T.ftensor4('y')

        layers_input = [x]
        dims = np.array([self.time_dim])
        for dim in self.lstm_layers_dim:
            dims = np.append(dims, dim)
        print "Dimensions =", dims

        # layer is just an index of the layer
        for layer in range(len(self.lstm_layers_dim)):

            # before the cell, input, forget and output gates, x needs to
            # be transformed
            linear = Linear(
                dims[layer],
                dims[layer + 1] * 4,
                weights_init=Orthogonal(self.orth_scale),
                #weights_init=IsotropicGaussian(mean=1.,std=1),
                biases_init=Constant(0),
                name="linear" + str(layer))
            linear.initialize()
            lstm_input = linear.apply(layers_input[layer])

            # the lstm wants batch X time X value
            lstm = LSTM(dim=dims[layer + 1],
                        weights_init=IsotropicGaussian(mean=0., std=0.5),
                        biases_init=Constant(1),
                        name="lstm" + str(layer))
            lstm.initialize()
            # hack to use Orthogonal on lstm w_state
            lstm.W_state.set_value(
                self.orth_scale *
                Orthogonal().generate(np.random,
                                      lstm.W_state.get_value().shape))
            h, _dummy = lstm.apply(lstm_input)

            layers_input.append(h)

        # this is where Alex Graves' paper starts
        print "Last linear transform dim :", dims[1:].sum()
        output_transform = Linear(
            dims[1:].sum(),
            self.output_dim,
            weights_init=Orthogonal(self.orth_scale),
            #weights_init=IsotropicGaussian(mean=0., std=1),
            use_bias=False,
            name="output_transform")
        output_transform.initialize()
        if len(self.lstm_layers_dim) == 1:
            print "hallo there, only one layer speaking"
            y_hat = output_transform.apply(layers_input[-1])
        else:
            y_hat = output_transform.apply(
                T.concatenate(layers_input[1:], axis=2))

        # transforms to find each gmm params (mu, pi, sig)
        # small hack to softmax a 3D tensor
        #pis = T.reshape(
        #            T.nnet.softmax(
        #                T.nnet.sigmoid(
        #                    T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
        #            (self.batch_dim, self.time_dim, self.gmm_dim))
        pis = T.reshape(
            T.nnet.softmax(
                T.reshape(y_hat[:, :, :self.gmm_dim],
                          (self.sequence_dim * self.batch_dim, self.gmm_dim))),
            (self.batch_dim, self.sequence_dim, self.gmm_dim))
        sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
        #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
        #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
        mus = y_hat[:, :, self.gmm_dim * 2:]

        pis = pis[:, :, :, np.newaxis]
        mus = mus[:, :, :, np.newaxis]
        sig = sig[:, :, :, np.newaxis]
        #y = y[:,:,np.newaxis,:]

        y = T.patternbroadcast(y, (False, False, True, False))
        mus = T.patternbroadcast(mus, (False, False, False, True))
        sig = T.patternbroadcast(sig, (False, False, False, True))

        # sum likelihood with targets
        # see blog for this crazy Pr() = sum log sum prod
        # axes :: (batch, sequence, mixture, time)
        expo_term = -0.5 * ((y - mus)**2) / sig**2
        coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
        #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
        sequences = coeff + expo_term
        log_sequences = T.log(pis + EPS) + T.sum(
            sequences, axis=3, keepdims=True)

        log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

        LL = -(log_sequences_max + T.log(EPS + T.sum(
            T.exp(log_sequences - log_sequences_max), axis=2, keepdims=True))
               ).mean()
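        # log-sum-exp trick: subtracting the per-example max before exp keeps
        # the sum over mixture components numerically stable; LL is the mean
        # negative log-likelihood of the Gaussian mixture.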

        model = Model(LL)
        self.model = model
        parameters = model.parameters

        grads = T.grad(LL, parameters)
        updates = []
        lr = T.scalar('lr')
        for i in range(len(grads)):
            #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
            updates.append(
                tuple([parameters[i], parameters[i] - lr * grads[i]]))

        #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
        if self.debug:
            gradf = theano.function([x, y, lr], [LL, pis, mus, sig],
                                    updates=updates)
        else:
            #gradf = theano.function([x, y, z],[zLL],updates=updates)
            gradf = theano.function([x, y, lr], [LL], updates=updates)
        f = theano.function([x], [pis, sig, mus])

        return gradf, f
Example #50
0
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')

    x_int = x.astype(dtype='int32').T
    train_dataset = TextFile('inspirational.txt')
    train_dataset.indexables[0] = numpy.array(sorted(
        train_dataset.indexables[0], key=len
    ))

    n_voc = len(train_dataset.dict.keys())

    # empirical distribution over the first token of each sentence
    # (count occurrences of each word id, then normalize)
    first_tokens = [s[0] for s in train_dataset.indexables[
        train_dataset.sources.index('features')]]
    init_probs = numpy.array(
        [sum(1 for idx in first_tokens if idx == w) for w in xrange(n_voc)],
        dtype=theano.config.floatX
    )
    init_probs = init_probs / init_probs.sum()

    n_h = 100
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4*n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    linear_embedding.initialize()
    
    rnn = LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    rnn.initialize()
    score_layer = Linear(
        input_dim=n_h,
        output_dim=n_voc,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)
    )
    score_layer.initialize()

    embedding = (linear_embedding.apply(x_int[:-1])
                 * tensor.shape_padright(m.T[1:]))
    rnn_out = rnn.apply(inputs=embedding, mask=m.T[1:])
    probs = softmax(
        sequence_map(score_layer.apply, rnn_out[0], mask=m.T[1:])[0]
    )
    idx_mask = m.T[1:].nonzero()
    cost = CategoricalCrossEntropy().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    cost.name = 'cost'
    misclassification = MisclassificationRate().apply(
        x_int[1:][idx_mask[0], idx_mask[1]],
        probs[idx_mask[0], idx_mask[1]]
    )
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            [StepClipping(10.),
             Adam()])
    )

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=train_dataset.num_examples,
                batch_size=10,
            )
        ),
        mask_sources=('features',)
    )

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification],
        prefix='train',
        after_epoch=True))

    batch_size = 10
    length = 30
    trng = MRG_RandomStreams(18032015)
    u = trng.uniform(size=(length, batch_size, n_voc))
    gumbel_noise = -tensor.log(-tensor.log(u))
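    # Gumbel-max trick: adding -log(-log(U)) noise to log-probabilities and
    # taking the argmax draws exact samples from the corresponding categorical
    # distribution (used here for init_samples and inside sampling_step).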
    init_samples = (tensor.log(init_probs).dimshuffle(('x', 0))
                    + gumbel_noise[0]).argmax(axis=-1)
    init_states = rnn.initial_state('states', batch_size)
    init_cells = rnn.initial_state('cells', batch_size)

    def sampling_step(g_noise, states, cells, samples_step):
        embedding_step = linear_embedding.apply(samples_step)
        next_states, next_cells = rnn.apply(inputs=embedding_step,
                                            states=states,
                                            cells=cells,
                                            iterate=False)
        probs_step = softmax(score_layer.apply(next_states))
        next_samples = (tensor.log(probs_step)
                        + g_noise).argmax(axis=-1)

        return next_states, next_cells, next_samples

    [_, _, samples], _ = theano.scan(
        fn=sampling_step,
        sequences=[gumbel_noise[1:]],
        outputs_info=[init_states, init_cells, init_samples]
    )

    sampling = theano.function([], samples.owner.inputs[0].T)

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification']],
        titles=['Costs']))

    extensions.append(PlotManager('Language modelling example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())
    extensions.append(PrintSamples(sampler=sampling,
                                   voc=train_dataset.inv_dict))

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    main_loop.run()
Example #51
0
class Seq2Seq(Initializable):
    """ seq2seq model

    Parameters
    ----------
    emb_dim: int
        The dimension of word embeddings (including for def model if standalone)
    dim : int
        The dimension of the RNNs states (including for def model if standalone)
    num_input_words : int
        The size of the LM's input vocabulary.
    num_output_words : int
        The size of the LM's output vocabulary.
    vocab
        The vocabulary object.
    """
    def __init__(self,
                 emb_dim,
                 dim,
                 num_input_words,
                 num_output_words,
                 vocab,
                 proximity_coef=0,
                 proximity_distance='l2',
                 encoder='lstm',
                 decoder='lstm',
                 shared_rnn=False,
                 translate_layer=None,
                 word_dropout=0.,
                 tied_in_out=False,
                 vocab_keys=None,
                 seed=0,
                 reconstruction_coef=1.,
                 provide_targets=False,
                 **kwargs):
        """
        translate_layer: either a string containing the activation function to
                         use, or a list containing the activations for an MLP
        """
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._word_dropout = word_dropout

        self._tied_in_out = tied_in_out

        if not encoder:
            if proximity_coef:
                raise ValueError("Err: meaningless penalty term (no encoder)")
            if not vocab_keys:
                raise ValueError("Err: specify a key vocabulary (no encoder)")

        if tied_in_out and num_input_words != num_output_words:
            raise ValueError("Can't tie in and out embeddings. Different "
                             "vocabulary size")
        if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'):
            raise ValueError(
                "can't share RNN because either encoder or decoder"
                "is not an RNN")
        if shared_rnn and decoder == 'lstm_c':
            raise ValueError(
                "can't share RNN because the decoder takes different"
                "inputs")
        if word_dropout < 0 or word_dropout > 1:
            raise ValueError("invalid value for word dropout",
                             str(word_dropout))
        if proximity_distance not in ['l1', 'l2', 'cos']:
            raise ValueError(
                "unrecognized distance: {}".format(proximity_distance))

        if proximity_coef and emb_dim != dim and not translate_layer:
            raise ValueError(
                "if proximity penalisation is used, emb_dim should equal dim "
                "or there should be a translate layer")

        if encoder not in [
                None, 'lstm', 'bilstm', 'mean', 'weighted_mean', 'max_bilstm',
                'bilstm_sum', 'max_bilstm_sum'
        ]:
            raise ValueError('encoder not recognized')
        if decoder not in ['skip-gram', 'lstm', 'lstm_c']:
            raise ValueError('decoder not recognized')

        self._proximity_distance = proximity_distance
        self._decoder = decoder
        self._encoder = encoder
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._proximity_coef = proximity_coef
        self._reconstruction_coef = reconstruction_coef
        self._provide_targets = provide_targets

        self._word_to_id = WordToIdOp(self._vocab)
        if vocab_keys:
            self._key_to_id = WordToIdOp(vocab_keys)

        children = []

        if encoder or (not encoder and decoder in ['lstm', 'lstm_c']):
            self._main_lookup = LookupTable(self._num_input_words,
                                            emb_dim,
                                            name='main_lookup')
            children.append(self._main_lookup)
        if provide_targets:
            # useful to simulate Hill's baseline: pretrained embeddings are not
            # fed to the encoder, they are only used as targets for it.
            self._target_lookup = LookupTable(self._num_input_words,
                                              emb_dim,
                                              name='target_lookup')
            children.append(self._target_lookup)
        if not encoder:
            self._key_lookup = LookupTable(vocab_keys.size(),
                                           emb_dim,
                                           name='key_lookup')
            children.append(self._key_lookup)
        elif encoder == 'lstm':
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = LSTM(dim, name='encoder_rnn')
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm', 'max_bilstm']:
            # dim is the dim of the concatenated vector
            self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork')
            self._encoder_rnn = Bidirectional(LSTM(dim / 2,
                                                   name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm_sum', 'max_bilstm_sum']:
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = BidirectionalSum(LSTM(dim, name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder == 'mean':
            pass
        elif encoder == 'weighted_mean':
            self._encoder_w = MLP([Logistic()], [dim, 1],
                                  name="encoder_weights")
            children.extend([self._encoder_w])
        else:
            raise NotImplementedError()

        if decoder in ['lstm', 'lstm_c']:
            dim_after_translate = emb_dim
            if shared_rnn:
                self._decoder_fork = self._encoder_fork
                self._decoder_rnn = self._encoder_rnn
            else:
                if decoder == 'lstm_c':
                    dim_2 = dim + emb_dim
                else:
                    dim_2 = dim
                self._decoder_fork = Linear(dim_2,
                                            4 * dim,
                                            name='decoder_fork')
                self._decoder_rnn = LSTM(dim, name='decoder_rnn')
            children.extend([self._decoder_fork, self._decoder_rnn])
        elif decoder == 'skip-gram':
            dim_after_translate = emb_dim

        self._translate_layer = None
        activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None}

        if translate_layer:
            if type(translate_layer) == str:
                translate_layer = [translate_layer]
            assert (type(translate_layer) == list)
            activations_translate = [activations[a] for a in translate_layer]
            dims_translate = [
                dim,
            ] * len(translate_layer) + [dim_after_translate]
            self._translate_layer = MLP(activations_translate,
                                        dims_translate,
                                        name="translate_layer")
            children.append(self._translate_layer)

        if not self._tied_in_out:
            self._pre_softmax = Linear(emb_dim, self._num_output_words)
            children.append(self._pre_softmax)
        if decoder in ['lstm', 'lstm_c']:
            self._softmax = NDimensionalSoftmax()
        elif decoder in ['skip-gram']:
            self._softmax = Softmax()
        children.append(self._softmax)

        super(Seq2Seq, self).__init__(children=children, **kwargs)

    def _allocate(self):
        pass

    def _initialize(self):
        pass

    def get_embeddings_entries(self):
        return self._vocab.words

    def set_def_embeddings(self, embeddings, lookup='main'):
        if lookup == 'main':
            self._main_lookup.parameters[0].set_value(
                embeddings.astype(floatX))
        elif lookup == 'target':
            self._target_lookup.parameters[0].set_value(
                embeddings.astype(floatX))
        else:
            raise ValueError('Requested embedding not understood')

    def get_def_embeddings_params(self, lookup='main'):
        if lookup == 'main':
            return self._main_lookup.parameters[0]
        elif lookup == 'key':
            return self._key_lookup.parameters[0]
        elif lookup == 'target':
            return self._target_lookup.parameters[0]

        else:
            raise ValueError('Requested embedding not understood')

    def add_perplexity_measure(self, application_call, minus_logs, mask, name):
        sum_ce = (minus_logs * mask).sum()
        perplexity = T.exp(sum_ce / mask.sum())
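        # perplexity = exp(average per-token cross-entropy); tagging it with an
        # aggregation scheme built from the summed CE and the token count is
        # presumably what lets monitoring re-aggregate it across batches.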
        perplexity.tag.aggregation_scheme = Perplexity(sum_ce, mask.sum())
        application_call.add_auxiliary_variable(perplexity, name=name)
        return sum_ce / mask.sum()

    @application
    def apply(self,
              application_call,
              words,
              mask,
              keys=None,
              n_identical_keys=None,
              train_phase=True):
        """Compute the log-likelihood for a batch of sequences.

        words
            An integer matrix of shape (B, T), where T is the number of time
            steps and B is the batch size. Note that this axis order is
            different from what the RNN bricks consume, hence the axes have
            to be transposed at some point.
        mask
            A float32 matrix of shape (B, T). Zeros indicate the padding.
        keys
            An integer vector of shape (B,). It contains the words that are
            defined in the corresponding rows of `words`.

        """
        if not keys and self._proximity_coef != 0:
            raise ValueError(
                "Err: should provide keys when using penalty term")

        if not self._encoder and not keys:
            raise ValueError("Err: should provide keys (no encoder)")

        word_ids = self._word_to_id(words)
        if keys:
            key_ids = self._word_to_id(keys)

        # dropout

        unk = self._vocab.unk
        if self._word_dropout > 0 and train_phase:
            dropout_mask = T.ones_like(word_ids, dtype=int)
            dropout_mask = get_dropout_mask(dropout_mask, self._word_dropout)
            # this gives a matrix of 0 (dropped word) and ones (kept words)
            # replace 0s by unk token and 1s by word ids
            word_ids_dropped = (T.eq(dropout_mask, 1) * word_ids +
                                T.eq(dropout_mask, 0) * unk)
            word_ids_in = word_ids_dropped
        else:
            word_ids_in = word_ids

        # shortlisting
        # input_word_ids uses word dropout

        input_word_ids = (
            T.lt(word_ids_in, self._num_input_words) * word_ids_in +
            T.ge(word_ids_in, self._num_input_words) * unk)
        output_word_ids = (T.lt(word_ids, self._num_output_words) * word_ids +
                           T.ge(word_ids, self._num_output_words) * unk)

        if self._encoder or self._decoder != 'skip-gram':
            input_embeddings = self._main_lookup.apply(input_word_ids)

        # Encoder

        if self._encoder == 'lstm' or 'bilstm' in self._encoder:
            encoder_rnn_states = self._encoder_rnn.apply(T.transpose(
                self._encoder_fork.apply(input_embeddings), (1, 0, 2)),
                                                         mask=mask.T)[0]

            if self._encoder in ['lstm', 'bilstm', 'bilstm_sum']:
                gen_embeddings = encoder_rnn_states[-1]
            elif self._encoder in ['max_bilstm', 'max_bilstm_sum']:
                mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'),
                                         2)  # (bs,L,dim)
                gen_embeddings = (input_embeddings * mask_bc +
                                  (1 - mask_bc) * -10**8).max(axis=1)
            else:
                raise ValueError("encoder {} apply not specific".format(
                    self._encoder))
        elif self._encoder == 'mean':
            mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2)
            gen_embeddings = (input_embeddings * mask_bc).mean(axis=1)
        elif self._encoder == 'weighted_mean':
            mask_bc = T.addbroadcast(mask.dimshuffle(0, 1, 'x'), 2)
            weights = self._encoder_w.apply(input_embeddings)
            weights = T.addbroadcast(weights, 2)
            weights = weights * mask_bc
            gen_embeddings = (input_embeddings * weights).mean(axis=1)
        elif not self._encoder:
            gen_embeddings = self._key_lookup.apply(key_ids)
        else:
            raise NotImplementedError()

        # Optional translation layer

        if self._translate_layer:
            in_decoder = self._translate_layer.apply(gen_embeddings)
        else:
            in_decoder = gen_embeddings  # (bs, dim)

        application_call.add_auxiliary_variable(in_decoder.copy(),
                                                name="embeddings")

        # Decoder

        if self._decoder in ['lstm', 'lstm_c']:
            if self._decoder == 'lstm_c':
                tiled_in_decoder = T.tile(in_decoder.dimshuffle(0, 'x', 1),
                                          (input_embeddings.shape[1], 1))
                input_embeddings = T.concatenate(
                    [input_embeddings, tiled_in_decoder], axis=2)

            decoded = self._decoder_rnn.apply(
                inputs=T.transpose(self._decoder_fork.apply(input_embeddings),
                                   (1, 0, 2)),
                mask=mask.T,
                states=in_decoder)[0]  # size (L, bs, dim)
            n_dim_decoded = 3
        elif self._decoder == 'skip-gram':
            decoded = in_decoder  # size (bs, dim)
            n_dim_decoded = 2
        else:
            raise NotImplementedError()

        # we ignore the <bos> token
        targets = output_word_ids.T[1:]  # (L-1, bs)
        targets_mask = mask.T[1:]  # (L-1,bs)

        # Compute log probabilities

        if n_dim_decoded == 2:
            # Case where we have only one distrib for all timesteps: skip-gram
            if self._tied_in_out:
                W_out = self.get_def_embeddings_params().transpose(
                )  # (dim, V)
                logits = T.dot(decoded, W_out)  # (bs, dim) x (dim,V) = (bs,V)
            else:
                logits = self._pre_softmax.apply(decoded)  # (bs, V)
            size_batch, length_sentence = output_word_ids.shape
            normalized_logits = self._softmax.log_probabilities(
                logits)  # (bs, V)
            indices = (targets.T + T.addbroadcast(
                (T.arange(size_batch) * logits.shape[1]).dimshuffle(
                    0, 'x'), 1)).flatten()  # (bs*L)
            minus_logs = -normalized_logits.flatten()[indices].reshape(
                (size_batch, length_sentence - 1)).T  # (L-1, bs)
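            # `indices` are flat offsets row * V + target into the flattened
            # (bs, V) log-probability matrix, gathering log p(target) for
            # every (example, position) pair.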

        elif n_dim_decoded == 3:
            # Case where decoding is time dependent: recurrent decoders
            if self._tied_in_out:
                raise NotImplementedError()
                # TODO: implement... seems annoying because we need to replace
                # in the already implemented block code
            else:
                logits = self._pre_softmax.apply(decoded[:-1])  # (L-1, bs, V)
                minus_logs = self._softmax.categorical_cross_entropy(
                    targets, logits, extra_ndim=1)

        avg_CE = self.add_perplexity_measure(application_call, minus_logs,
                                             targets_mask, "perplexity")
        costs = self._reconstruction_coef * avg_CE

        if self._proximity_coef > 0:
            if not self._encoder:
                key_ids = self._key_to_id(keys)
            else:
                key_ids = self._word_to_id(keys)

            # shortlist: if we don't use all the input embeddings, we need to shortlist
            # so that there isn't a key error
            key_ids = (T.lt(key_ids, self._num_input_words) * key_ids +
                       T.ge(key_ids, self._num_input_words) * unk)

            if self._provide_targets:
                key_embeddings = self._target_lookup.apply(
                    key_ids)  #(bs, emb_dim)
            else:
                key_embeddings = self._main_lookup.apply(
                    key_ids)  #(bs, emb_dim)
            # don't penalize on UNK:
            mask = T.neq(key_ids, unk) * T.lt(key_ids, self._num_input_words)

            # average over dimension, and then manual averaging using the mask
            eps = T.constant(10**-6)
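            # eps guards the divisions below against a zero denominator when
            # every key in the batch is masked out (all UNK)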

            if self._proximity_distance in ['l1', 'l2']:
                if self._proximity_distance == 'l1':
                    diff_embeddings = T.abs_(key_embeddings - in_decoder)
                else:
                    diff_embeddings = (key_embeddings - in_decoder)**2

                mask = mask.reshape((-1, 1))
                sum_proximity_term = T.sum(
                    T.mean(diff_embeddings * mask, axis=1))
                proximity_term = sum_proximity_term / (T.sum(mask) + eps)

            elif self._proximity_distance == 'cos':
                # numerator
                # TODO: debug
                mask = mask.reshape((-1, 1))  # (bs, 1)
                masked_keys = key_embeddings * mask
                masked_gen = in_decoder * mask
                dot_product_vector = T.sum(masked_keys * masked_gen,
                                           axis=1)  #(bs)
                # denominator
                product_sqr_norms = T.sum((masked_keys)**2, axis=1) * T.sum(
                    (masked_gen)**2, axis=1)
                denominator = T.sqrt(product_sqr_norms + eps)  #(bs)
                proximity_term = -T.sum(dot_product_vector / denominator) / (
                    T.sum(mask) + eps)

            application_call.add_auxiliary_variable(proximity_term.copy(),
                                                    name="proximity_term")
            costs = costs + self._proximity_coef * proximity_term

        return costs
Example #52
0
class TestLSTM(unittest.TestCase):
    def setUp(self):
        self.lstm = LSTM(dim=3, weights_init=Constant(2),
                         biases_init=Constant(0))
        self.lstm.initialize()

    def test_one_step(self):
        h0 = tensor.matrix('h0')
        c0 = tensor.matrix('c0')
        x = tensor.matrix('x')
        h1, c1 = self.lstm.apply(x, h0, c0, iterate=False)
        next_h = theano.function(inputs=[x, h0, c0], outputs=[h1])

        h0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        c0_val = 0.1 * numpy.array([[1, 1, 0], [0, 1, 1]],
                                   dtype=theano.config.floatX)
        x_val = 0.1 * numpy.array([range(12), range(12, 24)],
                                  dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        # omitting biases because they are zero
        activation = numpy.dot(h0_val, W_state_val) + x_val
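        # the 4 * dim activation is consumed below as four width-3 slices:
        # input gate, forget gate, cell candidate and output gate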

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        i_t = sigmoid(activation[:, :3] + c0_val * W_cell_to_in)
        f_t = sigmoid(activation[:, 3:6] + c0_val * W_cell_to_forget)
        next_cells = f_t * c0_val + i_t * numpy.tanh(activation[:, 6:9])
        o_t = sigmoid(activation[:, 9:12] +
                      next_cells * W_cell_to_out)
        h1_val = o_t * numpy.tanh(next_cells)
        assert_allclose(h1_val, next_h(x_val, h0_val, c0_val)[0],
                        rtol=1e-6)

    def test_many_steps(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        h, c = self.lstm.apply(x, mask=mask, iterate=True)
        calc_h = theano.function(inputs=[x, mask], outputs=[h])

        x_val = (0.1 * numpy.asarray(
            list(itertools.islice(itertools.permutations(range(12)), 0, 24)),
            dtype=theano.config.floatX))
        x_val = numpy.ones((24, 4, 12),
                           dtype=theano.config.floatX) * x_val[:, None, :]
        mask_val = numpy.ones((24, 4), dtype=theano.config.floatX)
        mask_val[12:24, 3] = 0
        h_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        c_val = numpy.zeros((25, 4, 3), dtype=theano.config.floatX)
        W_state_val = 2 * numpy.ones((3, 12), dtype=theano.config.floatX)
        W_cell_to_in = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_out = 2 * numpy.ones((3,), dtype=theano.config.floatX)
        W_cell_to_forget = 2 * numpy.ones((3,), dtype=theano.config.floatX)

        def sigmoid(x):
            return 1. / (1. + numpy.exp(-x))

        for i in range(1, 25):
            activation = numpy.dot(h_val[i-1], W_state_val) + x_val[i-1]
            i_t = sigmoid(activation[:, :3] + c_val[i-1] * W_cell_to_in)
            f_t = sigmoid(activation[:, 3:6] + c_val[i-1] * W_cell_to_forget)
            c_val[i] = f_t * c_val[i-1] + i_t * numpy.tanh(activation[:, 6:9])
            o_t = sigmoid(activation[:, 9:12] +
                          c_val[i] * W_cell_to_out)
            h_val[i] = o_t * numpy.tanh(c_val[i])
            h_val[i] = (mask_val[i - 1, :, None] * h_val[i] +
                        (1 - mask_val[i - 1, :, None]) * h_val[i - 1])
            c_val[i] = (mask_val[i - 1, :, None] * c_val[i] +
                        (1 - mask_val[i - 1, :, None]) * c_val[i - 1])

        h_val = h_val[1:]
        assert_allclose(h_val, calc_h(x_val, mask_val)[0], rtol=1e-04)

        # Also test that initial state is a parameter
        initial1, initial2 = VariableFilter(roles=[INITIAL_STATE])(
            ComputationGraph(h))
        assert is_shared_variable(initial1)
        assert is_shared_variable(initial2)
        assert {initial1.name, initial2.name} == {
            'initial_state', 'initial_cells'}
Example #53
0
    def __init__(self):
        inp = tensor.tensor3('input')
        inp = inp.dimshuffle(1,0,2)
        target = tensor.matrix('target')
        target = target.reshape((target.shape[0],))
        product = tensor.lvector('product')
        missing = tensor.eq(inp, 0)
        train_input_mean = 1470614.1
        train_input_std = 3256577.0

        trans_1 = tensor.concatenate((inp[1:,:,:],tensor.zeros((1,inp.shape[1],inp.shape[2]))), axis=0) 
        trans_2 = tensor.concatenate((tensor.zeros((1,inp.shape[1],inp.shape[2])), inp[:-1,:,:]), axis=0) 
        inp = tensor.switch(missing,(trans_1+trans_2)/2, inp)
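        # trans_1 / trans_2 hold the next / previous observation at each time
        # step, so missing (zero) entries are imputed with the average of
        # their two temporal neighbours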

        lookup = LookupTable(length=352, dim=4*hidden_dim)
        product_embed = lookup.apply(product)

        salut = tensor.concatenate((inp, missing), axis=2)
        linear = Linear(input_dim=input_dim+1, output_dim=4*hidden_dim,
                        name="lstm_in")
        inter = linear.apply(salut)
        inter = inter + product_embed[None, :, :]


        lstm = LSTM(dim=hidden_dim, activation=activation_function,
                    name="lstm")

        hidden, cells = lstm.apply(inter)

        linear2 = Linear(input_dim=hidden_dim, output_dim=out_dim,
                         name="output_linear")

        pred = linear2.apply(hidden[-1])*train_input_std + train_input_mean
        pred = pred.reshape((product.shape[0],))
        
        cost = tensor.mean(abs((pred-target)/target)) 
        # Initialize all bricks
        for brick in [linear, linear2, lstm, lookup]:
            brick.weights_init = IsotropicGaussian(0.1)
            brick.biases_init = Constant(0.)
            brick.initialize()

        # Apply noise and dropout
        cg = ComputationGraph([cost])
        if w_noise_std > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, w_noise_std)
        if i_dropout > 0:
            cg = apply_dropout(cg, [hidden], i_dropout)
        [cost_reg] = cg.outputs
        cost_reg += 1e-20

        if cost_reg is not cost:
            self.cost = cost
            self.cost_reg = cost_reg

            cost_reg.name = 'cost_reg'
            cost.name = 'cost'

            self.sgd_cost = cost_reg

            self.monitor_vars = [[cost, cost_reg]]
        else:
            self.cost = cost
            cost.name = 'cost'

            self.sgd_cost = cost

            self.monitor_vars = [[cost]]

        self.pred = pred
        pred.name = 'pred'
Example #54
0
class impatientLayer:
    # both visual and word feature are in the joint space
    # of dim: feature_dim
    # hidden_dim: dim of m
    # output_dim: final joint document query representation dim
    def __init__(self, feature_dim, hidden_dim, output_dim):
        self.image_embed = Linear(input_dim=feature_dim,
                                  output_dim=hidden_dim,
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0),
                                  use_bias=False,
                                  name='image_embed')
        self.word_embed = Linear(input_dim=feature_dim,
                                 output_dim=hidden_dim,
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0),
                                 use_bias=False,
                                 name='word_embed')
        self.r_embed = Linear(input_dim=feature_dim,
                              output_dim=hidden_dim,
                              weights_init=IsotropicGaussian(0.01),
                              biases_init=Constant(0),
                              use_bias=False,
                              name='r_embed')
        self.m_to_s = Linear(input_dim=hidden_dim,
                             output_dim=1,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='m_to_s')
        self.attention_dist = Softmax(name='attention_dist_softmax')
        self.r_to_r = Linear(input_dim=feature_dim,
                             output_dim=feature_dim,
                             weights_init=IsotropicGaussian(0.01),
                             biases_init=Constant(0),
                             use_bias=False,
                             name='r_to_r')
        # self.r_to_g = Linear(input_dim=feature_dim,
        #                      output_dim=output_dim,
        #                      weights_init=IsotropicGaussian(0.01),
        #                      biases_init=Constant(0),
        #                      use_bias=False,
        #                      name='r_to_g')
        self.image_embed.initialize()
        self.word_embed.initialize()
        self.r_embed.initialize()
        self.m_to_s.initialize()
        self.r_to_r.initialize()
        # self.r_to_g.initialize()

        # the sequence to sequence LSTM
        self.seq = LSTM(output_dim,
                        name='rewatcher_seq',
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0))
        self.seq_embed = Linear(feature_dim,
                                output_dim * 4,
                                name='rewatcher_seq_embed',
                                weights_init=IsotropicGaussian(0.01),
                                biases_init=Constant(0),
                                use_bias=False)

        self.seq.initialize()
        self.seq_embed.initialize()

    # doc: row major batch_size x doc_length x feature_dim
    # query: row major batch_size x q x feature_dim
    # mask_: shape (batch_size,); index of the last query token per example
    #        (i.e. sentence length - 1), used to select the final step below
    def apply(self, doc, query, mask_, batch_size):
        # batch_size x doc_length x hidden_dim
        mask = mask_.flatten()
        att1 = self.image_embed.apply(doc)

        # y_q_i: the ith token of question
        #        batch_size x feature_dim
        # r_1: r_m_1
        #        batch_size x feature_dim
        # y_d: document
        #        batch_size x doc_length x feature_dim
        # y_d_m: d-to-m
        #        batch_size x doc_length x hidden_dim
        def one_step(y_q_i, r_1, y_d, y_d_m):
            # batch_size x hidden_dim
            att2 = self.r_embed.apply(r_1)
            # batch_size x hidden_dim
            att3 = self.word_embed.apply(y_q_i)
            att = y_d_m + att2.dimshuffle(0, 'x', 1) + att3.dimshuffle(0, 'x', 1)
            # batch_size x doc_length x hidden_dim
            m = T.tanh(att)
            # batch_size x doc_length x 1
            s = self.m_to_s.apply(m)
            # batch_size x doc_length
            s = s.reshape((s.shape[0], s.shape[1]))
            s = self.attention_dist.apply(s)
            y_d_s = y_d.swapaxes(1, 2)
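            # batched_dot(y_d_s, s) is the attention-weighted sum over document
            # positions; the second term carries over a transformed r_1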
            # return batch_size x feature_dim
            return T.batched_dot(y_d_s, s) + T.tanh(self.r_to_r.apply(r_1))

        # query: batch_size x q x feature_dim
        # r: q x batch_size x feature_dim
        r, updates = theano.scan(fn=one_step,
                                 sequences=[query.swapaxes(0,1)],
                                 outputs_info=T.zeros_like(doc[:, 0, :]),
                                 non_sequences=[doc, att1],
                                 n_steps=query.shape[1],
                                 name='impatient layer')

        # for the sequence encoder
        # q x batch_size x output_dim
        Wr = self.seq_embed.apply(r)
        # q x batch_size x output_dim
        seq_r, garbage = self.seq.apply(Wr)
        # batch_size x feature_dim
        r_q = r[mask, T.arange(batch_size), :]
        seq_r_q = seq_r[mask, T.arange(batch_size), :]
        # batch_size x output_dim
        return r_q, seq_r_q
Example #55
0
y_len = y_mask.sum(axis=0)
# inputt : T x B
# input_mask : T x B
# y : L x B
# y_mask : L x B

# Linear bricks in
input_to_h = LookupTable(num_input_classes, h_dim, name='lookup')
h = input_to_h.apply(inputt)
# h : T x B x h_dim

# RNN bricks
pre_lstm = Linear(input_dim=h_dim, output_dim=4*rec_dim, name='LSTM_linear')
lstm = LSTM(activation=Tanh(),
            dim=rec_dim, name="rnn")
rnn_out, _ = lstm.apply(pre_lstm.apply(h), mask=input_mask)

# Linear bricks out 
rec_to_o = Linear(name='rec_to_o',
                  input_dim=rec_dim,
                  output_dim=num_output_classes + 1)
y_hat_pre = rec_to_o.apply(rnn_out)
# y_hat_pre : T x B x C+1

# y_hat : T x B x C+1
y_hat = tensor.nnet.softmax(
    y_hat_pre.reshape((-1, num_output_classes + 1))
).reshape((y_hat_pre.shape[0], y_hat_pre.shape[1], -1))
y_hat.name = 'y_hat'

y_hat_mask = input_mask