def build_model(args):
    x = tensor.tensor3('features', dtype=floatX)
    y = tensor.tensor3('targets', dtype=floatX)

    linear = Linear(input_dim=1, output_dim=4 * args.units)
    rnn = LSTM(dim=args.units, activation=Tanh())
    linear2 = Linear(input_dim=args.units, output_dim=1)

    # LSTM.apply returns (states, cells); only the hidden states feed the
    # output layer.
    h, _ = rnn.apply(linear.apply(x))
    prediction = Tanh().apply(linear2.apply(h))
    prediction = prediction[:-1, :, :]

    # SquaredError does not work on 3D tensors, so flatten time and batch.
    y = y.reshape((y.shape[0] * y.shape[1], y.shape[2]))
    prediction = prediction.reshape((prediction.shape[0] * prediction.shape[1],
                                     prediction.shape[2]))
    cost = SquaredError().apply(y, prediction)

    # Initialization
    linear.weights_init = IsotropicGaussian(0.1)
    linear2.weights_init = IsotropicGaussian(0.1)
    linear.biases_init = Constant(0)
    linear2.biases_init = Constant(0)
    rnn.weights_init = Orthogonal()

    return cost
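# A minimal usage sketch (not from the original source; assumes Blocks'
# `Model` API and an `args` namespace carrying only a `units` attribute).
# build_model() only assigns the *_init schemes, so initialization still has
# to be triggered by the caller before training.
def _build_model_usage_sketch():
    args = argparse.Namespace(units=64)   # hypothetical args object
    cost = build_model(args)
    model = Model(cost)
    for brick in model.get_top_bricks():  # allocate and initialize parameters
        brick.initialize()
    return model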
def get_presoft(h, args):
    output_size = get_output_size(args.dataset)
    # If args.skip_connections: dim = args.layers * args.state_dim
    # else: dim = args.state_dim
    use_all_states = args.skip_connections or args.skip_output or (
        args.rnn_type in ["clockwork", "soft"])
    output_layer = Linear(
        input_dim=use_all_states * args.layers * args.state_dim +
        (1 - use_all_states) * args.state_dim,
        output_dim=output_size, name="output_layer")

    output_layer.weights_init = initialization.IsotropicGaussian(0.1)
    output_layer.biases_init = initialization.Constant(0)
    output_layer.initialize()

    presoft = output_layer.apply(h)
    if not has_indices(args.dataset):
        presoft = Tanh().apply(presoft)
    presoft.name = 'presoft'
    return presoft
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    self.dim = dim

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation

    children = [activation, gate_activation]
    kwargs.setdefault('children', []).extend(children)
    super(ZoneoutGRU, self).__init__(**kwargs)
def getBidir(input_dim, input_var):
    """SimpleRecurrent-based bidirectional network."""
    bidir = Bidirectional(weights_init=Orthogonal(),
                          prototype=SimpleRecurrent(dim=input_dim,
                                                    activation=Tanh()))
    # bidir.allocate()
    bidir.initialize()
    h = bidir.apply(input_var)
    net = add_softmax_layer(h, input_dim, 2)
    return net
def __init__(self, enc_transition, dims, dim_input, subsample, **kwargs):
    super(Encoder, self).__init__(**kwargs)
    self.subsample = subsample

    for layer_num, (dim_under, dim) in enumerate(
            zip([dim_input] + list(2 * numpy.array(dims)), dims)):
        bidir = Bidirectional(
            RecurrentWithFork(
                enc_transition(dim=dim, activation=Tanh()).apply,
                dim_under, name='with_fork'),
            name='bidir{}'.format(layer_num))
        self.children.append(bidir)
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    super(GatedRecurrent, self).__init__(**kwargs)
    self.dim = dim

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Sigmoid()
    self.activation = activation
    self.gate_activation = gate_activation

    self.children = [activation, gate_activation]
def __init__(self, dim, activation=None, mlp=None, **kwargs):
    super(SoftGatedRecurrent, self).__init__(**kwargs)
    self.dim = dim

    if not activation:
        activation = Tanh()
    self.activation = activation

    # The activation of the mlp should be a Logistic function
    self.mlp = mlp

    self.children = [activation, mlp]
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    self.dim = dim

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation

    # Pop any children passed in by a subclass so that 'children' is not
    # handed to the superclass constructor twice.
    children = [activation, gate_activation] + kwargs.pop('children', [])
    super(GatedRecurrent, self).__init__(children=children, **kwargs)
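# A hedged instantiation sketch (not from the original source): with no
# activations passed, the constructor above falls back to Tanh for the
# candidate state and Logistic for the gates. Dimension and initializer are
# illustrative only.
def _gated_recurrent_usage_sketch():
    rnn = GatedRecurrent(dim=256, weights_init=Orthogonal())
    rnn.initialize()
    return rnn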
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    super(LSTMGraves, self).__init__(**kwargs)
    self.dim = dim

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation

    self.children = [activation, gate_activation]
def __init__(self, dimension, alphabet_size, **kwargs):
    super(WordReverser, self).__init__(**kwargs)
    encoder = Bidirectional(
        SimpleRecurrent(dim=dimension, activation=Tanh()))
    fork = Fork([name for name in encoder.prototype.apply.sequences
                 if name != 'mask'])
    fork.input_dim = dimension
    fork.output_dims = [encoder.prototype.get_dim(name)
                        for name in fork.input_names]
    lookup = LookupTable(alphabet_size, dimension)
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension,
                                 name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=2 * dimension, match_dim=dimension, name="attention")
    readout = Readout(
        readout_dim=alphabet_size,
        source_names=[transition.apply.states[0],
                      attention.take_glimpses.outputs[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  attention=attention, name="generator")

    self.lookup = lookup
    self.fork = fork
    self.encoder = encoder
    self.generator = generator
    self.children = [lookup, fork, encoder, generator]
def __init__(self, vocab_size, topical_embedding_dim, state_dim, word_num,
             batch_size, **kwargs):
    super(topicalq_transformer, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.word_embedding_dim = topical_embedding_dim
    self.state_dim = state_dim
    self.word_num = word_num
    self.batch_size = batch_size
    self.look_up = LookupTable(name='topical_embeddings')
    self.transformer = MLP(
        activations=[Tanh()],
        dims=[self.word_embedding_dim * self.word_num, self.state_dim],
        name='topical_transformer')
    self.children = [self.look_up, self.transformer]
def __init__(self, dim, attended_dim, activation=None, gate_activation=None,
             **kwargs):
    super(GRU, self).__init__(**kwargs)
    self.dim = dim
    self.attended_dim = attended_dim

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation
    self.initial_transformer = MLP(activations=[Tanh()],
                                   dims=[attended_dim, self.dim],
                                   name='state_initializer')

    self.children = [activation, gate_activation, self.initial_transformer]
def test_highway_activation():
    x = T.matrix()
    highway = Highway(input_dim=100, output_activation=Tanh(),
                      transform_activation=Identity())
    highway.biases_init = Constant(0.0)
    highway.weights_init = init_Identity()
    y = highway.apply(x)
    highway.initialize()
    _func = theano.function([x], y)

    x_val = np.ones((4, 100), dtype=theano.config.floatX)
    ret = _func(x_val)
    assert_allclose(ret, np.tanh(np.ones((4, 100))))
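# Why tanh(1) is the expected value above (assuming the usual highway gating
# y = H(x) * T(x) + x * (1 - T(x))): with identity weights, zero biases and an
# Identity transform activation, the all-ones input gives T(x) = 1, so the
# output reduces to the Tanh branch, H(x) = tanh(1).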
def __init__(self, attended_dim, context_dim, **kwargs):
    super(GRUInitialStateWithInitialStateSumContext, self).__init__(**kwargs)
    self.attended_dim = attended_dim
    self.context_dim = context_dim

    # Two MLPs which map to the same dimension, then we sum their outputs.
    # The motivation here is to allow the network to pretrain on the normal
    # MT task, then keep some params static, and continue training with the
    # context-enhanced task.

    # the state transformer
    self.initial_transformer = MLP(activations=[Tanh()],
                                   dims=[attended_dim, self.dim],
                                   name='state_initializer')

    # the context transformer
    self.context_transformer = MLP(activations=[Tanh(), Tanh(), Tanh()],
                                   dims=[context_dim, 2000, 1000, self.dim],
                                   name='context_initializer')

    self.children.extend([self.initial_transformer,
                          self.context_transformer])
def test_convolutional_sequence_with_convolutions_raw_activation():
    seq = ConvolutionalSequence(
        [Convolutional(filter_size=(3, 3), num_filters=4),
         Rectifier(),
         Convolutional(filter_size=(5, 5), num_filters=3, step=(2, 2)),
         Tanh()],
        num_channels=2,
        image_size=(21, 39))
    seq.allocate()
    x = theano.tensor.tensor4()
    out = seq.apply(x).eval({x: numpy.ones((10, 2, 21, 39),
                                           dtype=theano.config.floatX)})
    assert out.shape == (10, 3, 8, 17)
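# Worked shape check for the sequence above (valid convolutions, no padding):
#   (21, 39) -- 3x3 conv          --> (19, 37)
#   (19, 37) -- 5x5 conv, step 2  --> ((19 - 5) // 2 + 1, (37 - 5) // 2 + 1) = (8, 17)
# with 3 filters in the last convolution, giving the asserted (10, 3, 8, 17).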
def main(save_to, num_epochs):
    mlp = MLP([Tanh(), Softmax()], [784, 100, 10],
              weights_init=IsotropicGaussian(0.01),
              biases_init=Constant(0))
    mlp.initialize()
    x = tensor.matrix('features')
    y = tensor.lmatrix('targets')
    probs = mlp.apply(x)
    cost = CategoricalCrossEntropy().apply(y.flatten(), probs)
    error_rate = MisclassificationRate().apply(y.flatten(), probs)

    cg = ComputationGraph([cost])
    W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables)
    cost = cost + .00005 * (W1 ** 2).sum() + .00005 * (W2 ** 2).sum()
    cost.name = 'final_cost'

    mnist_train = MNIST("train")
    mnist_test = MNIST("test")

    algorithm = GradientDescent(cost=cost, params=cg.parameters,
                                step_rule=Scale(learning_rate=0.1))
    main_loop = MainLoop(
        algorithm,
        DataStream(mnist_train,
                   iteration_scheme=SequentialScheme(
                       mnist_train.num_examples, 50)),
        model=Model(cost),
        extensions=[Timing(),
                    FinishAfter(after_n_epochs=num_epochs),
                    DataStreamMonitoring(
                        [cost, error_rate],
                        DataStream(mnist_test,
                                   iteration_scheme=SequentialScheme(
                                       mnist_test.num_examples, 500)),
                        prefix="test"),
                    TrainingDataMonitoring(
                        [cost, error_rate,
                         aggregation.mean(algorithm.total_gradient_norm)],
                        prefix="train",
                        after_epoch=True),
                    Checkpoint(save_to),
                    Plot('MNIST example',
                         channels=[['test_final_cost',
                                    'test_misclassificationrate_apply_error_rate'],
                                   ['train_total_gradient_norm']]),
                    Printing()])
    main_loop.run()
def test_activations():
    x = tensor.vector()
    x_val = numpy.random.rand(8).astype(theano.config.floatX)
    exp_x_val = numpy.exp(x_val)

    assert_allclose(x_val, Identity().apply(x).eval({x: x_val}))
    assert_allclose(numpy.tanh(x_val), Tanh().apply(x).eval({x: x_val}),
                    rtol=1e-06)
    assert_allclose(numpy.log(1 + exp_x_val),
                    Softplus(x).apply(x).eval({x: x_val}),
                    rtol=1e-6)
    assert_allclose(exp_x_val / numpy.sum(exp_x_val),
                    Softmax(x).apply(x).eval({x: x_val}).flatten(),
                    rtol=1e-6)
    assert_allclose(1.0 / (1.0 + numpy.exp(-x_val)),
                    Logistic(x).apply(x).eval({x: x_val}),
                    rtol=1e-6)
def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth,
             trg_dgru_depth, emitter=None, feedback_brick=None, merge=None,
             merge_prototype=None, post_merge=None, merged_dim=None,
             igru=None, **kwargs):
    # for compatibility
    if igru_depth == 1:
        self.igru = IGRU(dim=igru_state_dim)
    else:
        self.igru = RecurrentStack(
            [IGRU(dim=igru_state_dim, name='igru')] +
            [UpperIGRU(dim=igru_state_dim, activation=Tanh(),
                       name='upper_igru' + str(i))
             for i in range(1, igru_depth)],
            skip_connections=True)
    self.igru_depth = igru_depth
    self.trg_dgru_depth = trg_dgru_depth
    self.lookup = LookupTable(name='embeddings')
    self.vocab_size = vocab_size
    self.igru_state_dim = igru_state_dim
    self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                 output_dim=vocab_size)
    self.embedding_dim = embedding_dim
    self.gru_fork = Fork(
        [name for name in self.igru.apply.sequences
         if name != 'mask' and name != 'input_states'],
        prototype=Linear(), name='gru_fork')

    kwargs['children'] = [self.igru, self.lookup,
                          self.gru_to_softmax, self.gru_fork]
    super(Interpolator, self).__init__(emitter=emitter,
                                       feedback_brick=feedback_brick,
                                       merge=merge,
                                       merge_prototype=merge_prototype,
                                       post_merge=post_merge,
                                       merged_dim=merged_dim, **kwargs)
def __init__(self, dimension, input_size, embed_input=False, **kwargs):
    super(LSTMEncoder, self).__init__(**kwargs)
    if embed_input:
        self.embedder = LookupTable(input_size, dimension)
    else:
        self.embedder = Linear(input_size, dimension)
    self.fork = Fork(['inputs'], dimension, output_dims=[dimension],
                     prototype=Linear(dimension, 4 * dimension))
    encoder = Bidirectional(LSTM(dim=dimension, activation=Tanh()))
    self.encoder = encoder
    self.children = [encoder, self.embedder, self.fork]
def example4():
    """LSTM -> crashes when initializing the LSTM."""
    x = tensor.tensor3('x')
    dim = 3

    # gate_inputs = theano.function([x], x * 4)
    gate_inputs = Linear(input_dim=dim, output_dim=dim * 4, name="linear",
                         weights_init=initialization.Identity(),
                         biases_init=Constant(2))
    lstm = LSTM(dim=dim, activation=Tanh(),
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0))

    gate_inputs.initialize()
    hg = gate_inputs.apply(x)

    # print(gate_inputs.parameters)
    # print(gate_inputs.parameters[1].get_value())

    lstm.initialize()
    h, cells = lstm.apply(hg)
    print(lstm.parameters)

    f = theano.function([x], h)
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print(f(4 * np.ones((dim, 1, dim), dtype=theano.config.floatX)))
    print("Good Job!")

    # lstm_output =

    # Initial state
    h0 = tensor.matrix('h0')
    c = tensor.matrix('cells')
    h, c1 = lstm.apply(inputs=x, states=h0, cells=c)
    # lstm.apply(states=h0, cells=cells, inputs=gate_inputs)

    f = theano.function([x, h0, c], h)
    print("a")
    print(f(np.ones((3, 1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX),
            np.ones((1, 3), dtype=theano.config.floatX)))
def __init__(self, vocab_size, embedding_dim, n_layers, skip_connections,
             state_dim, **kwargs):
    """Sole constructor.

    Args:
        vocab_size (int): Source vocabulary size
        embedding_dim (int): Dimension of the embedding layer
        n_layers (int): Number of layers. Layers share the same weight
                        matrices.
        skip_connections (bool): Skip connections connect the source word
                                 embeddings directly with deeper layers to
                                 propagate the gradient more efficiently
        state_dim (int): Number of hidden units in the recurrent layers.
    """
    super(DeepBidirectionalEncoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.n_layers = n_layers
    self.state_dim = state_dim
    self.skip_connections = skip_connections
    self.lookup = LookupTable(name='embeddings')
    self.bidirs = []
    self.fwd_forks = []
    self.back_forks = []
    for i in xrange(self.n_layers):
        bidir = BidirectionalWMT15(
            GatedRecurrent(activation=Tanh(), dim=state_dim),
            name='bidir%d' % i)
        self.bidirs.append(bidir)
        self.fwd_forks.append(Fork(
            [name for name in bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='fwd_fork%d' % i))
        self.back_forks.append(Fork(
            [name for name in bidir.prototype.apply.sequences
             if name != 'mask'],
            prototype=Linear(), name='back_fork%d' % i))
    self.children = [self.lookup] \
                    + self.bidirs \
                    + self.fwd_forks \
                    + self.back_forks
def create_rnn(hidden_dim, vocab_dim, mode="rnn"): # input x = tensor.imatrix('inchar') y = tensor.imatrix('outchar') # W = LookupTable( name="W1", #dim = hidden_dim*4, dim=hidden_dim, length=vocab_dim, weights_init=initialization.IsotropicGaussian(0.01), biases_init=initialization.Constant(0)) if mode == "lstm": # Long Short Term Memory H = LSTM(hidden_dim, name='H', weights_init=initialization.IsotropicGaussian(0.01), biases_init=initialization.Constant(0.0)) else: # recurrent history weight H = SimpleRecurrent( name="H", dim=hidden_dim, activation=Tanh(), weights_init=initialization.IsotropicGaussian(0.01)) # S = Linear(name="W2", input_dim=hidden_dim, output_dim=vocab_dim, weights_init=initialization.IsotropicGaussian(0.01), biases_init=initialization.Constant(0)) A = NDimensionalSoftmax(name="softmax") initLayers([W, H, S]) activations = W.apply(x) hiddens = H.apply(activations) #[0] activations2 = S.apply(hiddens) y_hat = A.apply(activations2, extra_ndim=1) cost = A.categorical_cross_entropy(y, activations2, extra_ndim=1).mean() cg = ComputationGraph(cost) #print VariableFilter(roles=[WEIGHT])(cg.variables) #W1,H,W2 = VariableFilter(roles=[WEIGHT])(cg.variables) layers = (x, W, H, S, A, y) return cg, layers, y_hat, cost
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    self.transition = GRUInitialState(
        attended_dim=state_dim, dim=state_dim,
        activation=Tanh(), name='decoder')
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim, name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence(
            [Bias(dim=state_dim, name='maxout_bias').apply,
             Maxout(num_pieces=2, name='maxout').apply,
             Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                    use_bias=False, name='softmax0').apply,
             Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim,
        merge_prototype=Linear(use_bias=True))

    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self, dim, activation=None, gate_activation=None, **kwargs):
    super(GatedRecurrent, self).__init__(**kwargs)
    self.dim = dim
    self.recurrent_weights_init = None
    self.initial_states_init = None

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation

    self.children = [activation, gate_activation]
def __init__(self, feature_size, embedding_dim, state_dim, **kwargs):
    super(BidirectionalAudioEncoder, self).__init__(**kwargs)
    self.feature_size = feature_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim

    self.embedding = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="audio_embeddings")
    self.embedding_fwd_fork = Fork(
        [name for name in self.embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='embedding_fwd_fork')
    self.embedding_back_fork = Fork(
        [name for name in self.embedding.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='embedding_back_fork')

    self.bidir = BidirectionalWMT15(
        GatedRecurrent(activation=Tanh(), dim=state_dim),
        name="audio_representation")
    self.fwd_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='fwd_fork')
    self.back_fork = Fork(
        [name for name in self.bidir.prototype.apply.sequences
         if name != 'mask'],
        prototype=Linear(), name='back_fork')

    self.children = [self.bidir, self.embedding,
                     self.fwd_fork, self.back_fork,
                     self.embedding_fwd_fork, self.embedding_back_fork]
def apply_fc(x, fc_layers, fc_ws, fc_bs):
    out = x
    for layer in fc_layers:
        name, shape, act = layer
        w = {w.name: w for w in fc_ws}[name + '_w']
        b = {b.name: b for b in fc_bs}[name + '_b']
        if act == 'relu':
            act = Rectifier().apply
        elif act == 'tanh':
            act = Tanh().apply
        elif act == 'lin':
            act = lambda n: n
        out = tensor.dot(out, w)
        out = act(out + b)
    return out
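# A hedged sketch of the data layout apply_fc() appears to expect: fc_layers
# holds (name, shape, activation) triples, and the shared variables in fc_ws /
# fc_bs are looked up through the '<name>_w' / '<name>_b' naming convention.
# All names and sizes below are illustrative; numpy/theano are assumed to be
# imported as in the surrounding snippets.
def _apply_fc_usage_sketch():
    fc_layers = [('fc1', (512, 256), 'relu'), ('fc2', (256, 10), 'lin')]
    fc_ws = [theano.shared(numpy.zeros(shape, dtype=theano.config.floatX),
                           name=name + '_w')
             for name, shape, _ in fc_layers]
    fc_bs = [theano.shared(numpy.zeros(shape[1], dtype=theano.config.floatX),
                           name=name + '_b')
             for name, shape, _ in fc_layers]
    return apply_fc(tensor.matrix('x'), fc_layers, fc_ws, fc_bs)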
def __init__(self, dim, activation=None, mlp=None, **kwargs):
    super(HardGatedRecurrent, self).__init__(**kwargs)
    self.dim = dim

    if not activation:
        activation = Tanh()
    self.activation = activation

    # The activation of the mlp should be a Logistic function
    self.mlp = mlp

    # The random stream
    self.randomstream = MRG_RandomStreams()

    self.children = [activation, mlp]
def __init__(self, dim, activation=None, gate_activation=None,
             model_type=6, ogates_zoneout=False, **kwargs):
    self.dim = dim
    self.model_type = model_type
    self.ogates_zoneout = ogates_zoneout

    if not activation:
        activation = Tanh()
    if not gate_activation:
        gate_activation = Logistic()
    self.activation = activation
    self.gate_activation = gate_activation

    children = [self.activation, self.gate_activation]
    kwargs.setdefault('children', []).extend(children)
    super(ZoneoutLSTM, self).__init__(**kwargs)
def __init__(self, enc_transition, dims, dim_input, subsample, bidir,
             **kwargs):
    super(Encoder, self).__init__(**kwargs)
    self.subsample = subsample

    dims_under = [dim_input] + list((2 if bidir else 1) * numpy.array(dims))
    for layer_num, (dim_under, dim) in enumerate(zip(dims_under, dims)):
        layer = RecurrentWithFork(
            enc_transition(dim=dim, activation=Tanh()).apply,
            dim_under, name='with_fork{}'.format(layer_num))
        if bidir:
            layer = Bidirectional(layer, name='bidir{}'.format(layer_num))
        self.children.append(layer)
    self.dim_encoded = (2 if bidir else 1) * dims[-1]
def test_super_in_recurrent_overrider():
    # A regression test for the issue #475
    class SimpleRecurrentWithContext(SimpleRecurrent):
        @application(contexts=['context'])
        def apply(self, context, *args, **kwargs):
            kwargs['inputs'] += context
            return super(SimpleRecurrentWithContext, self).apply(*args,
                                                                 **kwargs)

        @apply.delegate
        def apply_delegate(self):
            return super(SimpleRecurrentWithContext, self).apply

    brick = SimpleRecurrentWithContext(100, Tanh())
    inputs = tensor.tensor3('inputs')
    context = tensor.matrix('context').dimshuffle('x', 0, 1)
    brick.apply(context, inputs=inputs)
def _build_bricks(self, *args, **kwargs):
    super(AttentionEUTHM2, self)._build_bricks()
    self.word_shift = MLP(
        activations=[Tanh('word_shift_tanh')],
        dims=[self.config.user_embed_dim + self.config.word_embed_dim,
              self.config.word_embed_dim],
        name='word_shift_mlp')
    self.word_shift.weights_init = IsotropicGaussian(
        std=1 / numpy.sqrt(self.config.word_embed_dim +
                           self.config.user_embed_dim))
    self.word_shift.biases_init = Constant(0)
    self.word_shift.initialize()
    self.word_shift_bias = Bias(dim=1, name='word_shift_bias')
    self.word_shift_bias.biases_init = Constant(0)
    self.word_shift_bias.initialize()
def test_integer_sequence_generator():
    # Disclaimer: here we only check shapes, not values.
    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim, weights_init=Orthogonal())
    generator = SequenceGenerator(
        LinearReadout(readout_dim=readout_dim, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(readout_dim, feedback_dim),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.initialize()

    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost(y, mask)
    assert costs.ndim == 2
    costs_val = theano.function([y, mask], [costs])(
        numpy.zeros((n_steps, batch_size), dtype='int64'),
        numpy.ones((n_steps, batch_size), dtype=floatX))[0]
    assert costs_val.shape == (n_steps, batch_size)

    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs],
        updates=costs.owner.inputs[0].owner.tag.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, **kwargs):
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    readout = Readout(
        source_names=['states', 'feedback', 'readout_context'],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(),
        feedback_brick=LookupFeedback(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence(
            [Bias(dim=1000).apply,
             Maxout(num_pieces=2).apply,
             Linear(input_dim=state_dim / 2, output_dim=100,
                    use_bias=False).apply,
             Linear(input_dim=100).apply]),
        merged_dim=1000)

    self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                name='decoder')
    # Readout will apply the linear transformation to 'readout_context'
    # with a Merge brick, so no need to fork it here
    self.fork = Fork([name for name in self.transition.apply.contexts +
                      self.transition.apply.states
                      if name != 'readout_context'],
                     prototype=Linear())
    self.tanh = Tanh()

    self.sequence_generator = SequenceGenerator(
        readout=readout, transition=self.transition,
        fork_inputs=[name for name in self.transition.apply.sequences
                     if name != 'mask'],
    )

    self.children = [self.fork, self.sequence_generator, self.tanh]
def __init__(self, config, vocab_size):
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    bricks = []

    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    bricks.append(embed)

    qembed = embed.apply(question)
    cembed = embed.apply(context)

    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + qlstms + clstms

    # Calculate question encoding (concatenate layer1)
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism MLP
    attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                        activations=config.attention_mlp_activations[1:] +
                        [Identity()],
                        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               name='attq')
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               use_bias=False, name='attc')
    bricks += [attention_mlp, attention_qlinear, attention_clinear]
    layer1 = Tanh().apply(
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2])))
        .reshape((cenc.shape[0], cenc.shape[1],
                  config.attention_mlp_hidden[0])) +
        attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'
    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    attended = tensor.sum(
        cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'

    # Now we can calculate our output
    out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden +
                  [config.n_entities],
                  activations=config.out_mlp_activations + [Identity()],
                  name='out_mlp')
    bricks += [out_mlp]
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    is_candidate = tensor.eq(
        tensor.arange(config.n_entities, dtype='int32')[None, None, :],
        tensor.switch(candidates_mask, candidates,
                      -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
    probs = tensor.switch(is_candidate, probs,
                          -1000 * tensor.ones_like(probs))

    # Calculate prediction, cost and error rate
    pred = probs.argmax(axis=1)
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Apply dropout
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    # Other stuff
    cost_reg.name = cost.name = 'cost'
    error_rate_reg.name = error_rate.name = 'error_rate'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialize bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
class FRNNEmitter(AbstractEmitter, Initializable, Random):
    """An RNN emitter for the case of real outputs.

    Parameters
    ----------
    """

    def __init__(self, mlp, target_size, frame_size, k, frnn_hidden_size,
                 frnn_step_size, const=1e-5, **kwargs):
        super(FRNNEmitter, self).__init__(**kwargs)

        self.mlp = mlp
        self.target_size = target_size
        self.frame_size = frame_size
        self.k = k
        self.frnn_hidden_size = frnn_hidden_size
        self.const = const
        self.input_dim = self.mlp.output_dim
        self.frnn_step_size = frnn_step_size

        # adding a step if the division is not exact.
        self.number_of_steps = frame_size // frnn_step_size
        self.last_steps = frame_size % frnn_step_size
        if self.last_steps != 0:
            self.number_of_steps += 1

        self.mu = MLP(activations=[Identity()],
                      dims=[frnn_hidden_size, k * frnn_step_size],
                      name=self.name + "_mu")
        self.sigma = MLP(activations=[SoftPlus()],
                         dims=[frnn_hidden_size, k * frnn_step_size],
                         name=self.name + "_sigma")
        self.coeff = MLP(activations=[Identity()],
                         dims=[frnn_hidden_size, k],
                         name=self.name + "_coeff")
        self.coeff2 = NDimensionalSoftmax()

        self.frnn_initial_state = Linear(
            input_dim=self.input_dim,
            output_dim=frnn_hidden_size,
            name="frnn_initial_state")

        # self.frnn_hidden = Linear(
        #     input_dim=frnn_hidden_size,
        #     output_dim=frnn_hidden_size,
        #     activation=Tanh(),
        #     name="frnn_hidden")

        self.frnn_activation = Tanh(name="frnn_activation")

        self.frnn_linear_transition_state = Linear(
            input_dim=frnn_hidden_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_state")
        self.frnn_linear_transition_input = Linear(
            input_dim=self.frnn_step_size,
            output_dim=frnn_hidden_size,
            name="frnn_linear_transition_input")

        # self.frnn_linear_transition_output = Linear(
        #     input_dim=frnn_hidden_size,
        #     output_dim=self.rnn_hidden_dim,
        #     name="frnn_linear_transition_output")

        self.children = [
            self.mlp, self.mu, self.sigma, self.coeff, self.coeff2,
            self.frnn_initial_state, self.frnn_activation,
            self.frnn_linear_transition_state,
            self.frnn_linear_transition_input]

    @application
    def emit(self, readouts):
        """
        keep_parameters is True if mu, sigma, coeffs must be stacked and
        returned; if false, only the result is given, the others will be an
        empty list.
        """
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        results = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            mu = self.mu.apply(state)
            sigma = self.sigma.apply(state) + self.const
            coeff = self.coeff2.apply(self.coeff.apply(state),
                                      extra_ndim=state.ndim - 2) + self.const

            shape_result = coeff.shape
            shape_result = tensor.set_subtensor(shape_result[-1],
                                                self.frnn_step_size)
            ndim_result = coeff.ndim

            mu = mu.reshape((-1, self.frnn_step_size, self.k))
            sigma = sigma.reshape((-1, self.frnn_step_size, self.k))
            coeff = coeff.reshape((-1, self.k))

            sample_coeff = self.theano_rng.multinomial(pvals=coeff,
                                                       dtype=coeff.dtype)
            idx = predict(sample_coeff, axis=-1)
            # idx = predict(coeff, axis=-1)  # use this line for using the
            # most likely coeff.

            # shapes (ls*bs)*(fs)
            mu = mu[tensor.arange(mu.shape[0]), :, idx]
            sigma = sigma[tensor.arange(sigma.shape[0]), :, idx]

            epsilon = self.theano_rng.normal(size=mu.shape, avg=0.0, std=1.0,
                                             dtype=mu.dtype)

            result = mu + sigma * epsilon  # * 0.6  # reduce variance.
            result = result.reshape(shape_result, ndim=ndim_result)
            results.append(result)

            # if the total size does not correspond to the frame_size,
            # this removes the need for padding
            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(result))

        results = tensor.stack(results, axis=-1)
        results = tensor.flatten(results, outdim=results.ndim - 1)

        # truncate if not good size
        if self.last_steps != 0:
            results = results[tuple([slice(0, None)] * (results.ndim - 1) +
                                    [slice(0, self.frame_size)])]

        return results

    @application
    def cost(self, readouts, outputs):
        # initial state
        state = self.frnn_initial_state.apply(self.mlp.apply(readouts))

        inputs = outputs

        mus = []
        sigmas = []
        coeffs = []

        for i in range(self.number_of_steps):
            last_iteration = i == self.number_of_steps - 1

            # First generating distribution parameters and sampling.
            freq_mu = self.mu.apply(state)
            freq_sigma = self.sigma.apply(state) + self.const
            freq_coeff = self.coeff2.apply(
                self.coeff.apply(state),
                extra_ndim=state.ndim - 2) + self.const

            freq_mu = freq_mu.reshape((-1, self.frnn_step_size, self.k))
            freq_sigma = freq_sigma.reshape((-1, self.frnn_step_size, self.k))
            freq_coeff = freq_coeff.reshape((-1, self.k))
            # mu, sigma: shape (-1, fs, k)
            # coeff: shape (-1, k)

            mus.append(freq_mu)
            sigmas.append(freq_sigma)
            coeffs.append(freq_coeff)

            index = self.frnn_step_size
            freq_inputs = inputs[
                tuple([slice(0, None)] * (inputs.ndim - 1) +
                      [slice(index, index + self.frnn_step_size)])]

            if not last_iteration:
                state = self.frnn_activation.apply(
                    self.frnn_linear_transition_state.apply(state) +
                    self.frnn_linear_transition_input.apply(freq_inputs))

        mus = tensor.stack(mus, axis=-2)
        sigmas = tensor.stack(sigmas, axis=-2)
        coeffs = tensor.stack(coeffs, axis=-2)

        mus = mus.reshape((-1, self.frnn_step_size * self.number_of_steps,
                           self.k))
        sigmas = sigmas.reshape((-1,
                                 self.frnn_step_size * self.number_of_steps,
                                 self.k))
        coeffs = coeffs.repeat(self.frnn_step_size, axis=-2)

        mus = mus[tuple([slice(0, None)] * (mus.ndim - 2) +
                        [slice(0, self.frame_size)] + [slice(0, None)])]
        sigmas = sigmas[tuple([slice(0, None)] * (sigmas.ndim - 2) +
                              [slice(0, self.frame_size)] +
                              [slice(0, None)])]
        coeffs = coeffs[tuple([slice(0, None)] * (coeffs.ndim - 2) +
                              [slice(0, self.frame_size)] +
                              [slice(0, None)])]
        # actually prob not necessary
        mu = mus.reshape((-1, self.target_size))
        sigma = sigmas.reshape((-1, self.target_size))
        coeff = coeffs.reshape((-1, self.target_size))

        return FRNN_NLL(y=outputs, mu=mu, sig=sigma, coeff=coeff,
                        frame_size=self.frame_size, k=self.k)

    @application
    def initial_outputs(self, batch_size):
        return tensor.zeros((batch_size, self.frame_size), dtype=floatX)

    def get_dim(self, name):
        # modification here to ensure the right dim.
        if name == "outputs":
            return self.frame_size
        return super(FRNNEmitter, self).get_dim(name)
def get_prediction_function():
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    """
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    """

    # Embed questions and context
    embed = bricks[-5]
    qembed = embed.apply(question.dimshuffle(1, 0))
    cembed = embed.apply(context.dimshuffle(1, 0))
    global _qembed, _cembed
    _qembed = theano.function([question], qembed)
    _cembed = theano.function([context], cembed)

    qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.dimshuffle(1, 0).astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.dimshuffle(1, 0).astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    global _qhidden, _chidden
    _qhidden = theano.function([question, question_mask], qhidden_list)
    _chidden = theano.function([context, context_mask], chidden_list)

    # Calculate question encoding (concatenate layer1)
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'
    global _qenc, _cenc
    _qenc = theano.function([question, question_mask], qenc)
    _cenc = theano.function([context, context_mask], cenc)

    # Attention mechanism MLP
    attention_mlp = bricks[-2]      # attention_mlp
    attention_qlinear = bricks[4]   # attq
    attention_clinear = bricks[11]  # attc
    layer1 = Tanh().apply(
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2])))
        .reshape((cenc.shape[0], cenc.shape[1],
                  config.attention_mlp_hidden[0])) +
        attention_qlinear.apply(qenc)[None, :, :])
    global _attention_clinear, _attention_qlinear
    _attention_clinear = theano.function(
        [context, context_mask],
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2])))
        .reshape((cenc.shape[0], cenc.shape[1],
                  config.attention_mlp_hidden[0])))
    _attention_qlinear = theano.function(
        [question, question_mask],
        attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'
    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    attended = tensor.sum(
        cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'
    global _attended
    _attended = theano.function([question, question_mask,
                                 context, context_mask], attended)

    # Now we can calculate our output
    out_mlp = bricks[-1]  # out_mlp
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    f = theano.function([question, question_mask, context, context_mask],
                        probs)
    return f
class Decoder(Initializable):
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=1000).apply,
                 Maxout(num_pieces=2).apply,
                 Linear(input_dim=state_dim / 2, output_dim=100,
                        use_bias=False).apply,
                 Linear(input_dim=100).apply]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([name for name in self.transition.apply.contexts +
                          self.transition.apply.states
                          if name != 'readout_context'],
                         prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout, transition=self.transition,
            fork_inputs=[name for name in self.transition.apply.sequences
                         if name != 'mask'],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]

    def _push_allocation_config(self):
        self.fork.input_dim = self.representation_dim
        self.fork.output_dims = [self.state_dim
                                 for _ in self.fork.output_names]

    @application(inputs=['representation', 'target_sentence_mask',
                         'target_sentence'], outputs=['cost'])
    def cost(self, representation, target_sentence, target_sentence_mask):
        target_sentence = target_sentence.dimshuffle(1, 0)
        target_sentence_mask = target_sentence_mask.T

        # The initial state and contexts, all functions of the representation
        contexts = {key: value.dimshuffle('x', 0, 1)
                    if key not in self.transition.apply.states else value
                    for key, value
                    in self.fork.apply(representation, as_dict=True).items()}
        contexts['states'] = self.tanh.apply(contexts['states'])
        cost = self.sequence_generator.cost(**merge(
            contexts, {'mask': target_sentence_mask,
                       'outputs': target_sentence,
                       'readout_context': representation.dimshuffle('x', 0, 1)}
        ))

        return (cost * target_sentence_mask).sum() / \
            target_sentence_mask.shape[1]
def __init__(self, config):
    inp = tensor.imatrix('bytes')

    embed = theano.shared(
        config.embedding_matrix.astype(theano.config.floatX),
        name='embedding_matrix')
    in_repr = embed[inp.flatten(), :].reshape((inp.shape[0],
                                               inp.shape[1],
                                               config.repr_dim))
    in_repr.name = 'in_repr'

    bricks = []
    states = []

    # Construct predictive GRU hierarchy
    hidden = []
    costs = []
    next_target = in_repr.dimshuffle(1, 0, 2)
    for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims,
                                          config.cost_factors,
                                          config.hidden_q)):
        init_state = theano.shared(
            numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX),
            name='st0_%d' % i)

        linear = Linear(input_dim=config.repr_dim, output_dim=3 * hdim,
                        name="lstm_in_%d" % i)
        lstm = GatedRecurrent(dim=hdim,
                              activation=config.activation_function,
                              name="lstm_rec_%d" % i)
        linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim,
                         name='lstm_out_%d' % i)
        tanh = Tanh('lstm_out_tanh_%d' % i)
        bricks += [linear, lstm, linear2, tanh]
        if i > 0:
            linear1 = Linear(input_dim=config.hidden_dims[i - 1],
                             output_dim=3 * hdim, name='lstm_in2_%d' % i)
            bricks += [linear1]

        next_target = tensor.cast(next_target, dtype=theano.config.floatX)
        inter = linear.apply(theano.gradient.disconnected_grad(next_target))
        if i > 0:
            inter += linear1.apply(
                theano.gradient.disconnected_grad(hidden[-1][:-1, :, :]))
        new_hidden = lstm.apply(inputs=inter[:, :, :hdim],
                                gate_inputs=inter[:, :, hdim:],
                                states=init_state)
        states.append((init_state, new_hidden[-1, :, :]))

        hidden += [tensor.concatenate([init_state[None, :, :], new_hidden],
                                      axis=0)]

        pred = tanh.apply(linear2.apply(hidden[-1][:-1, :, :]))
        costs += [numpy.float32(cf) *
                  (-next_target * pred).sum(axis=2).mean()]
        costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()]
        diff = next_target - pred
        next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5)

    # Construct output from hidden states
    hidden = [s.dimshuffle(1, 0, 2) for s in hidden]
    out_parts = []
    out_dims = config.out_hidden + [config.io_dim]
    for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)):
        pred_linear = Linear(input_dim=dim, output_dim=out_dims[0],
                             name='pred_linear_%d' % i)
        bricks.append(pred_linear)
        lin = theano.gradient.disconnected_grad(state)
        out_parts.append(pred_linear.apply(lin))

    # Do prediction and calculate cost
    out = sum(out_parts)

    if len(out_dims) > 1:
        out = config.out_hidden_act[0](name='out_act0').apply(out)
        mlp = MLP(dims=out_dims,
                  activations=[x(name='out_act%d' % i)
                               for i, x in
                               enumerate(config.out_hidden_act[1:])] +
                  [Identity()],
                  name='out_mlp')
        bricks.append(mlp)
        out = mlp.apply(out.reshape((inp.shape[0] * (inp.shape[1] + 1), -1))
                        ).reshape((inp.shape[0], inp.shape[1] + 1, -1))

    pred = out.argmax(axis=2)

    cost = Softmax().categorical_cross_entropy(
        inp.flatten(),
        out[:, :-1, :].reshape((inp.shape[0] * inp.shape[1],
                                config.io_dim))).mean()
    error_rate = tensor.neq(inp.flatten(), pred[:, :-1].flatten()).mean()

    sgd_cost = cost + sum(costs)

    # Initialize all bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()

    # apply noise
    cg = ComputationGraph([sgd_cost, cost, error_rate] + costs)
    if config.weight_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.weight_noise)
    sgd_cost = cg.outputs[0]
    cost = cg.outputs[1]
    error_rate = cg.outputs[2]
    costs = cg.outputs[3:]

    # put stuff into self that is useful for training or extensions
    self.sgd_cost = sgd_cost

    sgd_cost.name = 'sgd_cost'
    for i in range(len(costs)):
        costs[i].name = 'pred_cost_%d' % i
    cost.name = 'cost'
    error_rate.name = 'error_rate'

    self.monitor_vars = [costs, [cost], [error_rate]]

    self.out = out[:, 1:, :]
    self.pred = pred[:, 1:]

    self.states = states
def __init__(self, config, vocab_size):
    # set up 32-bit integer matrices
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    # and the multiple choice answers:
    ans1 = tensor.ivector('ans1')
    ans1_mask = tensor.ivector('ans1_mask')
    ans2 = tensor.ivector('ans2')
    ans2_mask = tensor.ivector('ans2_mask')
    ans3 = tensor.ivector('ans3')
    ans3_mask = tensor.ivector('ans3_mask')
    ans4 = tensor.ivector('ans4')
    ans4_mask = tensor.ivector('ans4_mask')

    bricks = []

    # inverts 1st and 2nd dimensions of matrix
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    bricks.append(embed)

    qembed = embed.apply(question)
    cembed = embed.apply(context)
    a1embed = embed.apply(ans1)
    a2embed = embed.apply(ans2)
    a3embed = embed.apply(ans3)
    a4embed = embed.apply(ans4)

    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size,
        question_mask.astype(theano.config.floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + qlstms + clstms

    # Calculate question encoding (concatenate layer1)
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1)
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]],
                                  axis=1)
    qenc.name = 'qenc'

    # Calculate context encoding (concatenate layer1)
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Attention mechanism MLP
    attention_mlp = MLP(dims=config.attention_mlp_hidden + [1],
                        activations=config.attention_mlp_activations[1:] +
                        [Identity()],
                        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               name='attq')
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=config.attention_mlp_hidden[0],
                               use_bias=False, name='attc')
    bricks += [attention_mlp, attention_qlinear, attention_clinear]
    layer1 = Tanh().apply(
        attention_clinear.apply(
            cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2])))
        .reshape((cenc.shape[0], cenc.shape[1],
                  config.attention_mlp_hidden[0])) +
        attention_qlinear.apply(qenc)[None, :, :])
    layer1.name = 'layer1'
    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    attended = tensor.sum(
        cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0)
    attended.name = 'attended'

    # Now we can calculate our output
    out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden +
                  [config.n_entities],
                  activations=config.out_mlp_activations + [Identity()],
                  name='out_mlp')
    bricks += [out_mlp]
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    # not needed anymore, since we're not only looking at entities
    # is_candidate = tensor.eq(
    #     tensor.arange(config.n_entities, dtype='int32')[None, None, :],
    #     tensor.switch(candidates_mask, candidates,
    #                   -tensor.ones_like(candidates))[:, :, None]).sum(axis=1)
    # probs = tensor.switch(is_candidate, probs,
    #                       -1000 * tensor.ones_like(probs))

    # Calculate prediction, cost and error rate
    # vocab = tensor.arange(10)
    # probs = numpy.asarray([0, 0.8, 0, 0.2], dtype=numpy.float32)
    # context = numpy.asarray([3, 2, 8, 1], dtype=numpy.int32)
    # ans3 = numpy.asarray([2, 8, 1], dtype=numpy.int32)
    # ans1 = numpy.asarray([1, 3, 4], dtype=numpy.int32)
    # ans2 = numpy.asarray([1, 1, 4], dtype=numpy.int32)

    # convert probs vector to one that's the same size as vocab,
    # with all zeros except probs:
    # probs = tensor.switch(is_candidate, probs,
    #                       -1000 * tensor.ones_like(probs))
    probsPadded = tensor.zeros_like(vocab_size, dtype=numpy.float32)
    probsSubset = probsPadded[cembed]  # TODO this should be masked
    b = tensor.set_subtensor(probsSubset, probs)

    # get the similarity score of each (masked) answer with the context probs;
    # NOTE: a1enc..a4enc (encoded answers) are not defined in this snippet,
    # so this scoring block is still work in progress (see the TODOs).
    ans1probs = b[a1enc]
    ans1score = tensor.switch(ans1_mask, ans1probs,
                              tensor.zeros_like(ans1probs)).sum()
    ans2probs = b[a2enc]
    ans2score = ans2probs.sum()
    ans3probs = b[a3enc]
    ans3score = ans3probs.sum()
    ans4probs = b[a4enc]
    ans4score = ans4probs.sum()

    # and pick the best one:
    allans = tensor.stacklists([ans1score, ans2score, ans3score, ans4score])
    pred = tensor.argmax(allans)

    cg = ComputationGraph([ans1probs, ans1score, ans2probs, ans2score,
                           ans3probs, ans3score, ans4probs, ans4score,
                           allans, pred])
    f = cg.get_theano_function()
    out = f()

    # pred = probs.argmax(axis=1)
    # print "pred"
    # print pred  TODO CHANGE THIS!
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Apply dropout
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    # Other stuff
    cost_reg.name = cost.name = 'cost'
    error_rate_reg.name = error_rate.name = 'error_rate'

    self.probs = probs
    self.probs.name = "probs"
    self.cost = cost
    self.cost.name = "cost"
    # self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialize bricks
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()