class ExtractiveQAModel(Initializable):
    """The dictionary-equipped extractive QA model.

    Parameters
    ----------
    dim : int
        The default dimensionality for the components.
    emb_dim : int
        The dimensionality for the embeddings. If 0, `dim` is used.
    coattention : bool
        Use the coattention mechanism.
    num_input_words : int
        The number of input words. If 0, `vocab.size()` is used.
    vocab
        The vocabulary object.
    use_definitions : bool
        Triggers the use of definitions.
    reuse_word_embeddings : bool
    compose_type : str

    """
    def __init__(self, dim, emb_dim, readout_dims, num_input_words,
                 def_num_input_words, vocab, use_definitions, def_word_gating,
                 compose_type, coattention, def_reader, reuse_word_embeddings,
                 random_unk, **kwargs):
        self._vocab = vocab
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if def_num_input_words == 0:
            def_num_input_words = num_input_words

        self._coattention = coattention
        self._num_input_words = num_input_words
        self._use_definitions = use_definitions
        self._random_unk = random_unk
        self._reuse_word_embeddings = reuse_word_embeddings

        lookup_num_words = num_input_words
        if reuse_word_embeddings:
            lookup_num_words = max(num_input_words, def_num_input_words)
        if random_unk:
            lookup_num_words = vocab.size()

        # Dima: we can have slightly less copy-paste here if we
        # copy the RecurrentFromFork class from my other projects.
        children = []
        self._lookup = LookupTable(lookup_num_words, emb_dim)
        self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
        self._encoder_rnn = LSTM(dim, name='encoder_rnn')
        self._question_transform = Linear(dim, dim, name='question_transform')
        self._bidir_fork = Linear(3 * dim if coattention else 2 * dim,
                                  4 * dim, name='bidir_fork')
        self._bidir = Bidirectional(LSTM(dim), name='bidir')
        children.extend([
            self._lookup, self._encoder_fork, self._encoder_rnn,
            self._question_transform, self._bidir, self._bidir_fork
        ])

        activations = [Rectifier()] * len(readout_dims) + [None]
        readout_dims = [2 * dim] + readout_dims + [1]
        self._begin_readout = MLP(activations, readout_dims,
                                  name='begin_readout')
        self._end_readout = MLP(activations, readout_dims, name='end_readout')
        self._softmax = NDimensionalSoftmax()
        children.extend(
            [self._begin_readout, self._end_readout, self._softmax])

        if self._use_definitions:
            # A potential bug here: we pass the same vocab to the def reader.
            # If a different token is reserved for UNK in the text and in the
            # definitions, we can be screwed.
            def_reader_class = eval(def_reader)
            def_reader_kwargs = dict(
                num_input_words=def_num_input_words,
                dim=dim,
                emb_dim=emb_dim,
                vocab=vocab,
                lookup=self._lookup if reuse_word_embeddings else None)
            if def_reader_class == MeanPoolReadDefinitions:
                def_reader_kwargs.update(dict(normalize=True,
                                              translate=False))
            self._def_reader = def_reader_class(**def_reader_kwargs)
            self._combiner = MeanPoolCombiner(dim=dim,
                                              emb_dim=emb_dim,
                                              def_word_gating=def_word_gating,
                                              compose_type=compose_type)
            children.extend([self._def_reader, self._combiner])

        super(ExtractiveQAModel, self).__init__(children=children, **kwargs)

        # Create the default input variables.
        self.contexts = tensor.lmatrix('contexts')
        self.context_mask = tensor.matrix('contexts_mask')
        self.questions = tensor.lmatrix('questions')
        self.question_mask = tensor.matrix('questions_mask')
        self.answer_begins = tensor.lvector('answer_begins')
        self.answer_ends = tensor.lvector('answer_ends')
        input_vars = [
            self.contexts, self.context_mask, self.questions,
            self.question_mask, self.answer_begins, self.answer_ends
        ]
        if self._use_definitions:
            self.defs = tensor.lmatrix('defs')
            self.def_mask = tensor.matrix('def_mask')
            self.contexts_def_map = tensor.lmatrix('contexts_def_map')
            self.questions_def_map = tensor.lmatrix('questions_def_map')
            input_vars.extend([
                self.defs, self.def_mask, self.contexts_def_map,
                self.questions_def_map
            ])
        self.input_vars = OrderedDict([(var.name, var) for var in input_vars])

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def embeddings_var(self):
        return self._lookup.parameters[0]

    def def_reading_parameters(self):
        parameters = Selector(self._def_reader).get_parameters().values()
        parameters.extend(Selector(self._combiner).get_parameters().values())
        if self._reuse_word_embeddings:
            lookup_parameters = Selector(
                self._lookup).get_parameters().values()
            parameters = [p for p in parameters
                          if p not in lookup_parameters]
        return parameters

    @application
    def _encode(self, application_call, text, mask, def_embs=None,
                def_map=None, text_name=None):
        if not self._random_unk:
            # Replace out-of-shortlist word ids with the UNK id.
            text = (tensor.lt(text, self._num_input_words) * text +
                    tensor.ge(text, self._num_input_words) * self._vocab.unk)
        if text_name:
            application_call.add_auxiliary_variable(
                unk_ratio(text, mask, self._vocab.unk),
                name='{}_unk_ratio'.format(text_name))
        embs = self._lookup.apply(text)
        if self._random_unk:
            # Out-of-shortlist embeddings are used but receive no gradient.
            embs = (tensor.lt(text, self._num_input_words)[:, :, None] * embs +
                    tensor.ge(text, self._num_input_words)[:, :, None] *
                    disconnected_grad(embs))
        if def_embs is not None:
            embs = self._combiner.apply(embs, mask, def_embs, def_map)
        add_role(embs, EMBEDDINGS)
        encoded = flip01(
            self._encoder_rnn.apply(self._encoder_fork.apply(flip01(embs)),
                                    mask=mask.T)[0])
        return encoded

    @application
    def apply(self, application_call, contexts, contexts_mask, questions,
              questions_mask, answer_begins, answer_ends, defs=None,
              def_mask=None, contexts_def_map=None, questions_def_map=None):
        def_embs = None
        if self._use_definitions:
            def_embs = self._def_reader.apply(defs, def_mask)

        context_enc = self._encode(contexts, contexts_mask, def_embs,
                                   contexts_def_map, 'context')
        question_enc_pre = self._encode(questions, questions_mask, def_embs,
                                        questions_def_map, 'question')
        question_enc = tensor.tanh(
            self._question_transform.apply(question_enc_pre))

        # Should be (batch size, context length, question length).
        affinity = tensor.batched_dot(context_enc, flip12(question_enc))
        affinity_mask = contexts_mask[:, :, None] * questions_mask[:, None, :]
        affinity = affinity * affinity_mask - 1000.0 * (1 - affinity_mask)
        # Soft-aligns every position in the context to positions in the
        # question.
        d2q_att_weights = self._softmax.apply(affinity, extra_ndim=1)
        application_call.add_auxiliary_variable(d2q_att_weights.copy(),
                                                name='d2q_att_weights')
        # Soft-aligns every position in the question to positions in the
        # document.
        q2d_att_weights = self._softmax.apply(flip12(affinity), extra_ndim=1)
        application_call.add_auxiliary_variable(q2d_att_weights.copy(),
                                                name='q2d_att_weights')

        # Question encoding "in the view of the document".
        question_enc_informed = tensor.batched_dot(q2d_att_weights,
                                                   context_enc)
        question_enc_concatenated = tensor.concatenate(
            [question_enc, question_enc_informed], 2)
        # Document encoding "in the view of the question".
        context_enc_informed = tensor.batched_dot(d2q_att_weights,
                                                  question_enc_concatenated)

        if self._coattention:
            context_enc_concatenated = tensor.concatenate(
                [context_enc, context_enc_informed], 2)
        else:
            question_repr_repeated = tensor.repeat(question_enc[:, [-1], :],
                                                   context_enc.shape[1],
                                                   axis=1)
            context_enc_concatenated = tensor.concatenate(
                [context_enc, question_repr_repeated], 2)

        # Note: the forward and backward LSTMs share the input weights
        # in the current implementation.
        bidir_states = flip01(
            self._bidir.apply(self._bidir_fork.apply(
                flip01(context_enc_concatenated)),
                              mask=contexts_mask.T)[0])

        begin_readouts = self._begin_readout.apply(bidir_states)[:, :, 0]
        begin_readouts = begin_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        begin_costs = self._softmax.categorical_cross_entropy(
            answer_begins, begin_readouts)

        end_readouts = self._end_readout.apply(bidir_states)[:, :, 0]
        end_readouts = end_readouts * contexts_mask - 1000.0 * (
            1 - contexts_mask)
        end_costs = self._softmax.categorical_cross_entropy(
            answer_ends, end_readouts)

        predicted_begins = begin_readouts.argmax(axis=-1)
        predicted_ends = end_readouts.argmax(axis=-1)
        exact_match = (tensor.eq(predicted_begins, answer_begins) *
                       tensor.eq(predicted_ends, answer_ends))
        application_call.add_auxiliary_variable(predicted_begins,
                                                name='predicted_begins')
        application_call.add_auxiliary_variable(predicted_ends,
                                                name='predicted_ends')
        application_call.add_auxiliary_variable(exact_match,
                                                name='exact_match')

        return begin_costs + end_costs

    def apply_with_default_vars(self):
        return self.apply(*self.input_vars.values())
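
# A minimal standalone sketch (not from the original source) of the UNK
# shortlisting used in `_encode` above: word ids at or above the shortlist
# size are mapped to the UNK id with pure tensor arithmetic, so the
# replacement stays inside the compiled Theano graph. Sizes are hypothetical
# and `5` stands in for `vocab.unk`.
def _demo_unk_shortlisting():
    import numpy
    ids = tensor.lmatrix('ids')
    num_input_words = 10  # shortlist size (assumption)
    unk = 5               # stand-in for vocab.unk
    shortlisted = (tensor.lt(ids, num_input_words) * ids +
                   tensor.ge(ids, num_input_words) * unk)
    f = theano.function([ids], shortlisted)
    # [[3, 12], [9, 42]] -> [[3, 5], [9, 5]]
    return f(numpy.array([[3, 12], [9, 42]], dtype='int64'))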
class Seq2Seq(Initializable):

    def __init__(self, emb_dim, dim, num_input_words, num_output_words,
                 vocab, proximity_coef=0, proximity_distance='l2',
                 encoder='lstm', decoder='lstm', shared_rnn=False,
                 translate_layer=None, word_dropout=0., tied_in_out=False,
                 vocab_keys=None, seed=0, reconstruction_coef=1.,
                 provide_targets=False, **kwargs):
        """
        translate_layer: either a string containing the activation function
            to use, or a list containing the activations for an MLP
        """
        if emb_dim == 0:
            emb_dim = dim
        if num_input_words == 0:
            num_input_words = vocab.size()
        if num_output_words == 0:
            num_output_words = vocab.size()

        self._word_dropout = word_dropout
        self._tied_in_out = tied_in_out

        if not encoder:
            if proximity_coef:
                raise ValueError("Err: meaningless penalty term (no encoder)")
            if not vocab_keys:
                raise ValueError("Err: specify a key vocabulary (no encoder)")
        if tied_in_out and num_input_words != num_output_words:
            raise ValueError("Can't tie in and out embeddings. Different "
                             "vocabulary size")
        if shared_rnn and (encoder != 'lstm' or decoder != 'lstm'):
            raise ValueError("can't share RNN because either the encoder "
                             "or the decoder is not an RNN")
        if shared_rnn and decoder == 'lstm_c':
            raise ValueError("can't share RNN because the decoder takes "
                             "different inputs")
        if word_dropout < 0 or word_dropout > 1:
            raise ValueError("invalid value for word dropout",
                             str(word_dropout))
        if proximity_distance not in ['l1', 'l2', 'cos']:
            raise ValueError(
                "unrecognized distance: {}".format(proximity_distance))
        if proximity_coef and emb_dim != dim and not translate_layer:
            raise ValueError("if proximity penalisation is used, emb_dim "
                             "should equal dim or there should be a "
                             "translate layer")
        if encoder not in [
                None, 'lstm', 'bilstm', 'mean', 'weighted_mean', 'max_bilstm',
                'bilstm_sum', 'max_bilstm_sum'
        ]:
            raise ValueError('encoder not recognized')
        if decoder not in ['skip-gram', 'lstm', 'lstm_c']:
            raise ValueError('decoder not recognized')

        self._proximity_distance = proximity_distance
        self._decoder = decoder
        self._encoder = encoder
        self._num_input_words = num_input_words
        self._num_output_words = num_output_words
        self._vocab = vocab
        self._proximity_coef = proximity_coef
        self._reconstruction_coef = reconstruction_coef
        self._provide_targets = provide_targets

        self._word_to_id = WordToIdOp(self._vocab)
        if vocab_keys:
            self._key_to_id = WordToIdOp(vocab_keys)

        children = []

        if encoder or (not encoder and decoder in ['lstm', 'lstm_c']):
            self._main_lookup = LookupTable(self._num_input_words, emb_dim,
                                            name='main_lookup')
            children.append(self._main_lookup)
        if provide_targets:
            # This is useful to simulate Hill's baseline: the pretrained
            # embeddings are not used in the encoder, only as targets
            # for the encoder.
            self._target_lookup = LookupTable(self._num_input_words, emb_dim,
                                              name='target_lookup')
            children.append(self._target_lookup)

        if not encoder:
            self._key_lookup = LookupTable(vocab_keys.size(), emb_dim,
                                           name='key_lookup')
            children.append(self._key_lookup)
        elif encoder == 'lstm':
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = LSTM(dim, name='encoder_rnn')
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm', 'max_bilstm']:
            # dim is the dim of the concatenated vector
            self._encoder_fork = Linear(emb_dim, 2 * dim, name='encoder_fork')
            self._encoder_rnn = Bidirectional(
                LSTM(dim / 2, name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder in ['bilstm_sum', 'max_bilstm_sum']:
            self._encoder_fork = Linear(emb_dim, 4 * dim, name='encoder_fork')
            self._encoder_rnn = BidirectionalSum(
                LSTM(dim, name='encoder_rnn'))
            children.extend([self._encoder_fork, self._encoder_rnn])
        elif encoder == 'mean':
            pass
        elif encoder == 'weighted_mean':
            self._encoder_w = MLP([Logistic()], [dim, 1],
                                  name="encoder_weights")
            children.extend([self._encoder_w])
        else:
            raise NotImplementedError()

        if decoder in ['lstm', 'lstm_c']:
            dim_after_translate = emb_dim
            if shared_rnn:
                self._decoder_fork = self._encoder_fork
                self._decoder_rnn = self._encoder_rnn
            else:
                if decoder == 'lstm_c':
                    dim_2 = dim + emb_dim
                else:
                    dim_2 = dim
                self._decoder_fork = Linear(dim_2, 4 * dim,
                                            name='decoder_fork')
                self._decoder_rnn = LSTM(dim, name='decoder_rnn')
                children.extend([self._decoder_fork, self._decoder_rnn])
        elif decoder == 'skip-gram':
            dim_after_translate = emb_dim

        self._translate_layer = None
        activations = {'sigmoid': Logistic(), 'tanh': Tanh(), 'linear': None}
        if translate_layer:
            if type(translate_layer) == str:
                translate_layer = [translate_layer]
            assert type(translate_layer) == list
            activations_translate = [activations[a] for a in translate_layer]
            dims_translate = ([dim] * len(translate_layer) +
                              [dim_after_translate])
            self._translate_layer = MLP(activations_translate,
                                        dims_translate,
                                        name="translate_layer")
            children.append(self._translate_layer)

        if not self._tied_in_out:
            self._pre_softmax = Linear(emb_dim, self._num_output_words)
            children.append(self._pre_softmax)
        if decoder in ['lstm', 'lstm_c']:
            self._softmax = NDimensionalSoftmax()
        elif decoder in ['skip-gram']:
            self._softmax = Softmax()
        children.append(self._softmax)

        super(Seq2Seq, self).__init__(children=children, **kwargs)
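
# A minimal construction sketch for Seq2Seq (not from the original source;
# dims are hypothetical and `vocab` is assumed to be this repo's vocabulary
# object with a `size()` method). With emb_dim=0 the embedding size falls
# back to dim, which satisfies the proximity-penalty check without a
# translate layer.
def _example_seq2seq(vocab):
    from blocks.initialization import Constant, Uniform
    model = Seq2Seq(emb_dim=0, dim=256,
                    num_input_words=10000, num_output_words=10000,
                    vocab=vocab, encoder='bilstm', decoder='lstm',
                    proximity_coef=0.1, proximity_distance='cos',
                    weights_init=Uniform(width=0.1),
                    biases_init=Constant(0.))
    model.initialize()
    return model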
class ESIM(Initializable):
    """
    ESIM model based on
    https://github.com/NYU-MLL/multiNLI/blob/master/python/models/esim.py
    """

    # seq_length, emb_dim, hidden_dim
    def __init__(
            self,
            dim,
            emb_dim,
            vocab,
            def_emb_translate_dim=-1,
            def_dim=-1,
            encoder='bilstm',
            bn=True,
            def_reader=None,
            def_combiner=None,
            dropout=0.5,
            num_input_words=-1,
            # Others
            **kwargs):
        self._dropout = dropout
        self._vocab = vocab
        self._emb_dim = emb_dim
        self._def_reader = def_reader
        self._def_combiner = def_combiner

        if encoder != 'bilstm':
            raise NotImplementedError()

        if def_emb_translate_dim < 0:
            self.def_emb_translate_dim = emb_dim
        else:
            self.def_emb_translate_dim = def_emb_translate_dim

        if def_dim < 0:
            self._def_dim = emb_dim
        else:
            self._def_dim = def_dim

        if num_input_words > 0:
            logger.info("Restricting vocab to " + str(num_input_words))
            self._num_input_words = num_input_words
        else:
            self._num_input_words = vocab.size()

        children = []

        if self.def_emb_translate_dim != self._emb_dim:
            self._translate_pre_def = Linear(
                input_dim=emb_dim, output_dim=def_emb_translate_dim)
            children.append(self._translate_pre_def)
        else:
            self._translate_pre_def = None

        ## Embedding
        self._lookup = LookupTable(self._num_input_words, emb_dim,
                                   weights_init=GlorotUniform())
        children.append(self._lookup)

        if def_reader:
            self._final_emb_dim = self._def_dim
            self._def_reader = def_reader
            self._def_combiner = def_combiner
            children.extend([self._def_reader, self._def_combiner])
        else:
            self._final_emb_dim = self._emb_dim

        ## BiLSTM
        self._hyp_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim, 4 * dim,
            name='hyp_bidir_fork')
        self._hyp_bidir = Bidirectional(LSTM(dim), name='hyp_bidir')
        self._prem_bidir_fork = Linear(
            self._def_dim if def_reader else self._emb_dim, 4 * dim,
            name='prem_bidir_fork')
        self._prem_bidir = Bidirectional(LSTM(dim), name='prem_bidir')
        children.extend([self._hyp_bidir_fork, self._hyp_bidir])
        children.extend([self._prem_bidir, self._prem_bidir_fork])
        ## BiLSTM no. 2 (encodes the attended embeddings)
        self._hyp_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                       name='hyp_bidir_fork2')
        self._hyp_bidir2 = Bidirectional(LSTM(dim), name='hyp_bidir2')
        self._prem_bidir_fork2 = Linear(8 * dim, 4 * dim,
                                        name='prem_bidir_fork2')
        self._prem_bidir2 = Bidirectional(LSTM(dim), name='prem_bidir2')
        children.extend([self._hyp_bidir_fork2, self._hyp_bidir2])
        children.extend([self._prem_bidir2, self._prem_bidir_fork2])
        self._rnns = [
            self._prem_bidir2, self._hyp_bidir2, self._prem_bidir,
            self._hyp_bidir
        ]

        ## MLP
        if bn:
            self._mlp = BatchNormalizedMLP([Tanh()], [8 * dim, dim],
                                           conserve_memory=False, name="mlp")
            self._pred = BatchNormalizedMLP([Softmax()], [dim, 3],
                                            conserve_memory=False,
                                            name="pred_mlp")
        else:
            self._mlp = MLP([Tanh()], [8 * dim, dim], name="mlp")
            self._pred = MLP([Softmax()], [dim, 3], name="pred_mlp")
        children.append(self._mlp)
        children.append(self._pred)

        ## Softmax
        self._ndim_softmax = NDimensionalSoftmax()
        children.append(self._ndim_softmax)

        super(ESIM, self).__init__(children=children, **kwargs)

    def get_embeddings_lookups(self):
        return [self._lookup]

    def set_embeddings(self, embeddings):
        self._lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    def get_def_embeddings_lookups(self):
        return [self._def_reader._def_lookup]

    def set_def_embeddings(self, embeddings):
        self._def_reader._def_lookup.parameters[0].set_value(
            embeddings.astype(theano.config.floatX))

    @application
    def apply(self, application_call, s1_preunk, s1_mask, s2_preunk, s2_mask,
              def_mask=None, defs=None, s1_def_map=None, s2_def_map=None,
              train_phase=True):
        # Shortlist words (sometimes we want a smaller vocab, especially
        # when the dictionary is small).
        s1 = (tensor.lt(s1_preunk, self._num_input_words) * s1_preunk +
              tensor.ge(s1_preunk, self._num_input_words) * self._vocab.unk)
        s2 = (tensor.lt(s2_preunk, self._num_input_words) * s2_preunk +
              tensor.ge(s2_preunk, self._num_input_words) * self._vocab.unk)

        ### Embed ###
        s1_emb = self._lookup.apply(s1)
        s2_emb = self._lookup.apply(s2)
        application_call.add_auxiliary_variable(1 * s1_emb,
                                                name='s1_word_embeddings')

        if self._def_reader:
            assert defs is not None
            def_embs = self._def_reader.apply(defs, def_mask)

            if self._translate_pre_def:
                logger.info("Translate pre def")
                s1_emb = s1_emb.reshape(
                    (s1_emb.shape[0] * s1_emb.shape[1], s1_emb.shape[2]))
                s2_emb = s2_emb.reshape(
                    (s2_emb.shape[0] * s2_emb.shape[1], s2_emb.shape[2]))
                s1_emb = self._translate_pre_def.apply(s1_emb)
                s2_emb = self._translate_pre_def.apply(s2_emb)
                s1_emb = s1_emb.reshape(
                    (s1_preunk.shape[0], s1_preunk.shape[1], -1))
                s2_emb = s2_emb.reshape(
                    (s2_preunk.shape[0], s2_preunk.shape[1], -1))

            s1_emb = self._def_combiner.apply(s1_emb, s1_mask, def_embs,
                                              s1_def_map, word_ids=s1,
                                              train_phase=train_phase,
                                              call_name="s1")
            s2_emb = self._def_combiner.apply(s2_emb, s2_mask, def_embs,
                                              s2_def_map, word_ids=s2,
                                              train_phase=train_phase,
                                              call_name="s2")
        else:
            if train_phase and self._dropout > 0:
                s1_emb = apply_dropout(s1_emb, drop_prob=self._dropout)
                s2_emb = apply_dropout(s2_emb, drop_prob=self._dropout)

        ### Encode ###
        # TODO: Share this bilstm?
        s1_bilstm, _ = self._prem_bidir.apply(
            flip01(self._prem_bidir_fork.apply(s1_emb)),
            mask=s1_mask.T)  # (batch_size, n_seq, 2 * dim)
        s2_bilstm, _ = self._hyp_bidir.apply(
            flip01(self._hyp_bidir_fork.apply(s2_emb)),
            mask=s2_mask.T)  # (batch_size, n_seq, 2 * dim)
        s1_bilstm = flip01(s1_bilstm)
        s2_bilstm = flip01(s2_bilstm)

        ### Attention ###
        # Compute the E matrix (eq. 11): E_ij = <s1[i], s2[j]>.
        # Each call computes one row E[i, :].
        def compute_e_row(s2_i, s1_bilstm, s1_mask):
            b_size = s1_bilstm.shape[0]
            # s2_i is (batch_size, emb_dim)
            # s1_bilstm is (batch_size, seq_len, emb_dim)
            # s1_mask is (batch_size, seq_len)
            s2_i = s2_i.reshape((b_size, s2_i.shape[1], 1))
            s2_i = T.repeat(s2_i, 2, axis=2)
            # s2_i is (batch_size, emb_dim, 2)
            assert s1_bilstm.ndim == 3
            assert s2_i.ndim == 3
            score = T.batched_dot(s1_bilstm, s2_i)  # (batch_size, seq_len, 1)
            score = score[:, :, 0].reshape(
                (b_size, -1))  # (batch_size, seq_len)
            return score  # E[i, :]

        # NOTE: No point in masking here.
        E, _ = theano.scan(compute_e_row,
                           sequences=[s1_bilstm.transpose(1, 0, 2)],
                           non_sequences=[s2_bilstm, s2_mask])
        # (seq_len, batch_size, seq_len)
        E = E.dimshuffle(1, 0, 2)
        assert E.ndim == 3

        s2s_att_weights = self._ndim_softmax.apply(E, extra_ndim=1)
        application_call.add_auxiliary_variable(
            s2s_att_weights.copy(),
            name='s2s_att_weights')  # (batch_size, seq_len, seq_len)

        ### Compute tilde vectors (eq. 12 and 13) ###
        def compute_tilde_vector(e_i, s, s_mask):
            # e_i is (batch_size, seq_len)
            # s_mask is (batch_size, seq_len)
            # s_tilde_i = \sum_j e_ij * b_j
            score = masked_softmax(e_i, s_mask, axis=1)
            score = score.dimshuffle(0, 1, "x")
            s_tilde_i = (score *
                         (s * s_mask.dimshuffle(0, 1, "x"))).sum(axis=1)
            return s_tilde_i  # (batch_size, 2 * dim)

        s1_tilde, _ = theano.scan(compute_tilde_vector,
                                  sequences=[E.dimshuffle(1, 0, 2)],
                                  non_sequences=[s2_bilstm, s2_mask])
        s1_tilde = s1_tilde.dimshuffle(1, 0, 2)
        s2_tilde, _ = theano.scan(compute_tilde_vector,
                                  sequences=[E.dimshuffle(2, 0, 1)],
                                  non_sequences=[s1_bilstm, s1_mask])
        s2_tilde = s2_tilde.dimshuffle(1, 0, 2)

        ### Compose (eq. 14 and 15) ###
        # (batch_size, seq_len, 8 * dim)
        s1_comp = T.concatenate(
            [s1_bilstm, s1_tilde, s1_bilstm - s1_tilde, s1_bilstm * s1_tilde],
            axis=2)
        s2_comp = T.concatenate(
            [s2_bilstm, s2_tilde, s2_bilstm - s2_tilde, s2_bilstm * s2_tilde],
            axis=2)

        ### Encode (eq. 16 and 17) ###
        # TODO: Share this bilstm?
        s1_comp_bilstm, _ = self._prem_bidir2.apply(
            self._prem_bidir_fork2.apply(flip01(s1_comp)),
            mask=s1_mask.T)  # (batch_size, n_seq, 2 * dim)
        s2_comp_bilstm, _ = self._hyp_bidir2.apply(
            self._hyp_bidir_fork2.apply(flip01(s2_comp)),
            mask=s2_mask.T)  # (batch_size, n_seq, 2 * dim)
        s1_comp_bilstm = flip01(s1_comp_bilstm)
        s2_comp_bilstm = flip01(s2_comp_bilstm)

        ### Pooling layer ###
        s1_comp_bilstm_ave = (
            (s1_mask.dimshuffle(0, 1, "x") * s1_comp_bilstm).sum(axis=1) /
            s1_mask.sum(axis=1).dimshuffle(0, "x"))
        s1_comp_bilstm_max = T.max(
            (1 - s1_mask.dimshuffle(0, 1, "x")) * -10000 +
            s1_mask.dimshuffle(0, 1, "x") * s1_comp_bilstm,
            axis=1)
        s2_comp_bilstm_ave = (
            (s2_mask.dimshuffle(0, 1, "x") * s2_comp_bilstm).sum(axis=1) /
            s2_mask.sum(axis=1).dimshuffle(0, "x"))  # (batch_size, dim)
        s2_comp_bilstm_max = T.max(
            (1 - s2_mask.dimshuffle(0, 1, "x")) * -10000 +
            s2_mask.dimshuffle(0, 1, "x") * s2_comp_bilstm,
            axis=1)

        ### Final classifier ###
        # MLP layer; m is (batch_size, 8 * dim).
        m = T.concatenate([
            s1_comp_bilstm_ave, s1_comp_bilstm_max, s2_comp_bilstm_ave,
            s2_comp_bilstm_max
        ], axis=1)
        pre_logits = self._mlp.apply(m)
        if train_phase:
            pre_logits = apply_dropout(pre_logits, drop_prob=self._dropout)

        # Get the prediction.
        self.logits = self._pred.apply(pre_logits)
        return self.logits
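
# A self-contained numpy sketch (not from the original source) of the masked
# pooling used in the ESIM pooling layer above: padding positions are
# excluded from the average and pushed to -10000 before the max, so they can
# never win.
def _demo_masked_pooling():
    import numpy
    x = numpy.random.randn(2, 5, 4)  # (batch_size, seq_len, 2 * dim)
    mask = numpy.array([[1, 1, 1, 0, 0],
                        [1, 1, 1, 1, 1]], dtype='float32')
    ave = ((mask[:, :, None] * x).sum(axis=1) /
           mask.sum(axis=1)[:, None])
    max_ = ((1 - mask[:, :, None]) * -10000 +
            mask[:, :, None] * x).max(axis=1)
    assert ave.shape == max_.shape == (2, 4)
    return ave, max_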