Example #1
def build_model(opts,
                overall_maxlen,
                vocab_size=0,
                embedd_dim=50,
                embedding_weights=None,
                verbose=True,
                init_mean_value=None):

    word_input = Input(shape=(overall_maxlen, ),
                       dtype='int32',
                       name='word_input')

    x = Embedding(output_dim=embedd_dim,
                  input_dim=vocab_size,
                  input_length=overall_maxlen,
                  weights=[embedding_weights],
                  mask_zero=True,
                  name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    position_embedding = Position_Embedding(name='position_embedding')(drop_x)
    self_att = MutilHeadAttention(nb_head=8, size_per_head=16,
                                  name='self_att')(position_embedding)

    lstm = LSTM(opts.rnn_dim, return_sequences=True, name='lstm')(self_att)
    avg_pooling = GlobalAveragePooling1D(name='avg_pooling')(lstm)

    y = Dense(output_dim=1,
              activation='sigmoid',
              name='y',
              W_regularizer=l2(opts.l2_value))(avg_pooling)

    model = Model(input=[word_input], output=y)

    if opts.init_bias and init_mean_value:

        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    optimize = optimizers.rmsprop(lr=0.001)

    model.compile(loss='mse', optimizer=optimize)

    return model
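
The `opts` namespace comes from the project's command-line parser, which is not shown here. A minimal invocation sketch for Example #1, assuming only the attributes the function body actually reads (`opts.dropout`, `opts.rnn_dim`, `opts.l2_value`, `opts.init_bias`) and placeholder values for the vocabulary and embedding matrix:

import numpy as np
from argparse import Namespace

# hypothetical hyper-parameters; the real values come from the CLI parser
opts = Namespace(dropout=0.5, rnn_dim=100, l2_value=0.001, init_bias=False)

vocab_size = 4000
overall_maxlen = 500
embedd_dim = 50

# random matrix standing in for pretrained word vectors; row 0 is the
# padding index that mask_zero=True relies on
embedding_weights = np.random.uniform(-0.05, 0.05, (vocab_size, embedd_dim))
embedding_weights[0] = 0.0

model = build_model(opts, overall_maxlen,
                    vocab_size=vocab_size,
                    embedd_dim=embedd_dim,
                    embedding_weights=embedding_weights)

# dummy batch of two padded essays -> two scores in [0, 1]
scores = model.predict(np.random.randint(1, vocab_size, (2, overall_maxlen)))
print(scores.shape)   # (2, 1)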
Example #2
def build_model(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50, embedding_weights=None, verbose=False, init_mean_value=None):
    N = maxnum
    L = maxlen

    p = Input(shape=(4, 2048), dtype='float32', name='p')
    # img_vector = Dense(name='img_vector', units=128)(p)

    word_input = Input(shape=(N * L,), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L, weights=embedding_weights,
                  mask_zero=True, trainable=False, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    cnn_e = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, border_mode='valid'), name='cnn_e')(resh_W)

    att_cnn_e = TimeDistributed(Attention(), name='att_cnn_e')(cnn_e)

    lstm_e = LSTM(opts.lstm_units, return_sequences=True, name='lstm_e')(att_cnn_e)

    G = CoAttention(name='essay')([lstm_e, p])
    avg = GlobalAveragePooling1D()(G)
    final_vec_drop = Dropout(rate=0.5, name='final_vec_drop')(avg)
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output', W_regularizer=l2(opts.l2_value))(final_vec_drop)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(final_vec_drop)

    model = Model(input=[word_input, p], output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='adam')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
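
Example #2 feeds the co-attention layer a precomputed image-feature tensor `p` of shape (4, 2048) alongside the essay representation. How those features are produced is not shown; 2048 matches the pooled output of a standard ImageNet backbone, so one plausible (purely hypothetical) preparation step is to stack pooled ResNet50 features for four prompt images:

import numpy as np
from keras.applications.resnet50 import ResNet50, preprocess_input

# frozen backbone whose pooled output is a 2048-d vector per image
extractor = ResNet50(include_top=False, pooling='avg')

def image_features(images):
    """images: float array (4, 224, 224, 3) -> features (4, 2048)."""
    return extractor.predict(preprocess_input(images.astype('float32')))

# one training sample: 4 prompt images -> p of shape (1, 4, 2048)
p_sample = image_features(np.random.rand(4, 224, 224, 3) * 255.0)[None, ...]
print(p_sample.shape)   # (1, 4, 2048)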
Example #3
def build_hrcnn_model(opts,
                      vocab_size=0,
                      char_vocabsize=0,
                      maxnum=50,
                      maxlen=50,
                      maxcharlen=20,
                      embedd_dim=50,
                      embedding_weights=None,
                      verbose=False,
                      init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen

    logger.info(
        "Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, nbfilters = %s, filter1_len = %s, drop rate = %s"
        % (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim,
                  input_dim=vocab_size,
                  input_length=N * L,
                  weights=embedding_weights,
                  mask_zero=True,
                  name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # add char-based CNN, concatenating with word embedding to compose word representation
    if opts.use_char:
        char_input = Input(shape=(N * L * maxcharlen, ),
                           dtype='int32',
                           name='char_input')
        xc = Embedding(output_dim=opts.char_embedd_dim,
                       input_dim=char_vocabsize,
                       input_length=N * L * maxcharlen,
                       mask_zero=True,
                       name='xc')(char_input)
        xc_masked = ZeroMaskedEntries(name='xc_masked')(xc)
        drop_xc = Dropout(opts.dropout, name='drop_xc')(xc_masked)
        res_xc = Reshape((N * L, maxcharlen, opts.char_embedd_dim),
                         name='res_xc')(drop_xc)
        cnn_xc = TimeDistributed(Conv1D(opts.char_nbfilters,
                                        opts.filter2_len,
                                        padding='valid'),
                                 name='cnn_xc')(res_xc)
        max_xc = TimeDistributed(GlobalMaxPooling1D(), name='max_xc')(cnn_xc)
        res_xc2 = Reshape((N, L, opts.char_nbfilters), name='res_xc2')(max_xc)

        # concat word-level and char-level features (Keras 2 functional API)
        w_repr = concatenate([resh_W, res_xc2], name='w_repr')
        zcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                      opts.filter1_len,
                                      padding='valid'),
                               name='zcnn')(w_repr)
    else:
        zcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                      opts.filter1_len,
                                      padding='valid'),
                               name='zcnn')(resh_W)

    # pooling mode
    if opts.mode == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(),
                                   name='avg_zcnn')(zcnn)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = concatenate([avg_zcnn1, avg_zcnn2], name='avg_zcnn')
    else:
        raise NotImplementedError
    hz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                   name='hz_lstm')(avg_zcnn)

    if opts.mode == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on text')
        avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = concatenate([avg_hz_lstm1, avg_hz_lstm2],
                                  name='avg_hz_lstm')
    else:
        raise NotImplementedError
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1,
                  activation='sigmoid',
                  name='output',
                  W_regularizer=l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(avg_hz_lstm)

    if opts.use_char:
        model = Model(inputs=[word_input, char_input], outputs=y)
    else:
        model = Model(inputs=word_input, outputs=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
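
Every example pairs `mask_zero=True` with a custom `ZeroMaskedEntries` layer before `Dropout`/`Reshape`, because layers such as `Reshape` and `Conv1D` do not support masking. The layer is defined elsewhere in the repository; a minimal sketch of what a layer like this typically does (zero the padded embedding rows and stop the mask from propagating), assuming Keras 2:

from keras import backend as K
from keras.engine.topology import Layer   # keras.layers.Layer in newer releases

class ZeroMaskedEntries(Layer):
    """Zero the embedding rows of masked (padding) timesteps and consume the
    mask so that downstream layers without mask support can be used."""

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(ZeroMaskedEntries, self).__init__(**kwargs)

    def call(self, x, mask=None):
        if mask is None:
            return x
        mask = K.cast(mask, K.floatx())        # (batch, timesteps)
        return x * K.expand_dims(mask, -1)     # broadcast over the embedding dim

    def compute_mask(self, inputs, mask=None):
        return None                            # stop mask propagation here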
Example #4
def build_model_fusion(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50, embedding_weights=None, verbose=False, init_mean_value=None):

    # p_input1 = Input(shape=(256, 256, 3), dtype='float32', name='p_input1')
    # p_input2 = Input(shape=(256, 256, 3), dtype='float32', name='p_input2')
    # p_input3 = Input(shape=(256, 256, 3), dtype='float32', name='p_input3')
    # p_input4 = Input(shape=(256, 256, 3), dtype='float32', name='p_input4')
    p = Input(shape=(256, 256, 3), dtype='float32', name='p')
    cnn_model = cnn()
    img = cnn_model(p)
    img = Reshape([6*6, 100])(img)
    # img1 = cnn_model(p_input1)
    # img2 = cnn_model(p_input2)
    # img3 = cnn_model(p_input3)
    # img4 = cnn_model(p_input4)
    # img1 = GlobalMaxPooling2D()(img1)
    # img2 = GlobalMaxPooling2D()(img2)
    # img3 = GlobalMaxPooling2D()(img3)
    # img4 = GlobalMaxPooling2D()(img4)

    # img = concatenate([img1, img2, img3, img4], axis=1)
    # img = Reshape((4, 100))(img)

    N = maxnum
    L = maxlen

    word_input = Input(shape=(N * L,), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L, weights=embedding_weights,
                  mask_zero=True, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    cnn_e = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, border_mode='valid', activation='tanh'), name='cnn_e')(resh_W)
    cnn_e = Dropout(rate=0.5)(cnn_e)
    att_cnn_e = TimeDistributed(Attention(), name='att_cnn_e')(cnn_e)
    att_cnn_e = Dropout(rate=0.5)(att_cnn_e)
    lstm_e = LSTM(opts.lstm_units, return_sequences=True, name='lstm_e')(att_cnn_e)
    lstm_e = Dropout(rate=0.5)(lstm_e)
    G = CoAttention(name='essay')([lstm_e, img])
    avg = GlobalAveragePooling1D()(G)
    final_vec_drop = Dropout(rate=0.5, name='final_vec_drop')(avg)

    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output', W_regularizer=l2(opts.l2_value))(final_vec_drop)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(final_vec_drop)

    # model = Model(input=[word_input, p_input1, p_input2, p_input3, p_input4], output=y)
    model = Model(input=[word_input, p], output=y)
    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='adam')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
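
Example #4 relies on a `cnn()` image encoder defined elsewhere; since its output is reshaped to `(6*6, 100)`, the encoder must map a 256x256x3 image to a 6x6x100 feature map. A hypothetical encoder with a compatible output shape (an assumption, not the original definition):

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D

def cnn():
    """Hypothetical image encoder: 256x256x3 -> 6x6x100 feature map,
    matching the Reshape([6*6, 100]) applied to its output in Example #4."""
    model = Sequential(name='img_cnn')
    model.add(Conv2D(32, 3, padding='same', activation='relu',
                     input_shape=(256, 256, 3)))
    model.add(MaxPooling2D(2))   # 128x128
    model.add(Conv2D(64, 3, padding='same', activation='relu'))
    model.add(MaxPooling2D(2))   # 64x64
    model.add(Conv2D(64, 3, padding='same', activation='relu'))
    model.add(MaxPooling2D(2))   # 32x32
    model.add(Conv2D(100, 3, padding='same', activation='relu'))
    model.add(MaxPooling2D(2))   # 16x16
    model.add(MaxPooling2D(2))   # 8x8
    model.add(Conv2D(100, 3, padding='valid', activation='relu'))  # 6x6x100
    return model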
Example #5
def build_shrcnn_model(opts,
                       vocab_size=0,
                       char_vocabsize=0,
                       maxnum=50,
                       maxlen=50,
                       maxcnum=50,
                       maxclen=50,
                       maxcharlen=20,
                       embedd_dim=50,
                       embedding_weights=None,
                       verbose=False,
                       init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen

    cN = maxcnum
    cL = maxclen

    logger.info(
        "Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, nbfilters = %s, filter1_len = %s, drop rate = %s"
        % (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    context_input = Input(shape=(cN * cL, ),
                          dtype='int32',
                          name='context_input')

    emb = Embedding(output_dim=embedd_dim,
                    input_dim=vocab_size,
                    weights=embedding_weights,
                    mask_zero=True,
                    name='cx')
    cx = emb(context_input)
    cx_maskedout = ZeroMaskedEntries(name='cx_maskedout')(cx)
    drop_cx = Dropout(opts.dropout, name='drop_cx')(cx_maskedout)

    resh_C = Reshape((cN, cL, embedd_dim), name='resh_C')(drop_cx)

    czcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                   opts.filter1_len,
                                   padding='valid'),
                            name='czcnn')(resh_C)

    x = emb(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # add char-based CNN, concatenating with word embedding to compose word representation
    zcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                  opts.filter1_len,
                                  padding='valid'),
                           name='zcnn')(resh_W)
    '''
    encoded_essay = Reshape((zcnn.shape[1].value*zcnn.shape[2].value, opts.nbfilters))(zcnn)
    encoded_context = Reshape((czcnn.shape[1].value*czcnn.shape[2].value, opts.nbfilters))(czcnn)
    # bidaf
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(name='essay_context_similarity')
    # matrix_attention_layer = LinearMatrixAttention(name='passage_question_similarity')

    # Shape: (batch_size, num_passage_words, num_question_words)
    essay_context_similarity = matrix_attention_layer([encoded_essay, encoded_context])


    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    essay_context_attention = MaskedSoftmax()(essay_context_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="essay_context_vectors", use_masking=False)
    essay_context_vectors = weighted_sum_layer([encoded_context, essay_context_attention])

    
    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    context_essay_similarity = Max(axis=-1)(essay_context_similarity)
    # Shape: (batch_size, num_passage_words)
    context_essay_attention = MaskedSoftmax()(context_essay_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    context_essay_vector = weighted_sum_layer([encoded_essay, context_essay_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_context_essay_vector = repeat_layer([context_essay_vector, encoded_essay])

    complex_concat_layer = ComplexConcat(combination='1*2,1*3', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([encoded_essay,
                                                 essay_context_vectors,
                                                 tiled_context_essay_vector])
    

    complex_concat_layer = ComplexConcat(combination='1*2', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([encoded_essay,
                                                 essay_context_vectors])


    mcnn = Reshape((zcnn.shape[1].value, zcnn.shape[2].value, opts.nbfilters), name='mcnn')(final_merged_passage)
    '''

    # pooling mode
    if opts.mode == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(),
                                   name='avg_zcnn')(zcnn)
        # pool the context CNN too; chz_lstm below needs avg_czcnn in every mode
        avg_czcnn = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_czcnn')(czcnn)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
        avg_czcnn = TimeDistributed(Attention(), name='avg_czcnn')(czcnn)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = concatenate([avg_zcnn1, avg_zcnn2], name='avg_zcnn')
        # pool the context CNN too; chz_lstm below needs avg_czcnn in every mode
        avg_czcnn = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_czcnn')(czcnn)
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                   name='hz_lstm')(avg_zcnn)
    chz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                    name='chz_lstm')(avg_czcnn)

    if opts.mode == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'att':
        logger.info('Use co-attention on text')

        # PART 2:
        # Now we compute a similarity between the passage words and the question words, and
        # normalize the matrix in a couple of different ways for input into some more layers.
        matrix_attention_layer = MatrixAttention(
            name='essay_context_similarity')
        # Shape: (batch_size, num_passage_words, num_question_words)
        essay_context_similarity = matrix_attention_layer([hz_lstm, chz_lstm])

        # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
        # words for each passage word.
        essay_context_attention = MaskedSoftmax()(essay_context_similarity)
        weighted_sum_layer = WeightedSum(name="essay_context_vectors",
                                         use_masking=False)
        # Shape: (batch_size, num_passage_words, embedding_dim * 2)
        weighted_hz_lstm = weighted_sum_layer(
            [chz_lstm, essay_context_attention])

        # Min's paper finds, for each document word, the most similar question word to it, and
        # computes a single attention over the whole document using these max similarities.
        # Shape: (batch_size, num_passage_words)
        context_essay_similarity = Max(axis=-1)(essay_context_similarity)
        # Shape: (batch_size, num_passage_words)
        context_essay_attention = MaskedSoftmax()(context_essay_similarity)
        # Shape: (batch_size, embedding_dim * 2)
        weighted_sum_layer = WeightedSum(name="context_essay_vector",
                                         use_masking=False)
        context_essay_vector = weighted_sum_layer(
            [hz_lstm, context_essay_attention])

        # Then he repeats this question/passage vector for every word in the passage, and uses it
        # as an additional input to the hidden layers above.
        repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
        # Shape: (batch_size, num_passage_words, embedding_dim * 2)
        tiled_context_essay_vector = repeat_layer(
            [context_essay_vector, hz_lstm])

        complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3',
                                             name='final_merged_passage')
        final_merged_passage = complex_concat_layer(
            [hz_lstm, weighted_hz_lstm, tiled_context_essay_vector])

        avg_hz_lstm = LSTM(opts.lstm_units,
                           return_sequences=False,
                           name='avg_hz_lstm')(final_merged_passage)

        # avg_hz_lstm = CoAttentionWithoutBi(name='avg_hz_lstm')([hz_lstm, weighted_hz_lstm])

        # avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = concatenate([avg_hz_lstm1, avg_hz_lstm2],
                                  name='avg_hz_lstm')
    else:
        raise NotImplementedError
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1,
                  activation='sigmoid',
                  name='output',
                  W_regularizer=l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(avg_hz_lstm)

    model = Model(inputs=[word_input, context_input], outputs=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
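
The `MatrixAttention`, `MaskedSoftmax` and `WeightedSum` layers used in the 'att' branch come from an external library and are not defined here. Ignoring masking and assuming a plain dot-product similarity (MatrixAttention's similarity function is configurable), the arithmetic of that branch's essay-to-context attention reduces to this NumPy sketch:

import numpy as np

hz_lstm = np.random.randn(1, 8, 100)    # (batch, essay_steps, lstm_units)
chz_lstm = np.random.randn(1, 6, 100)   # (batch, context_steps, lstm_units)

# dot-product similarity: (batch, essay_steps, context_steps)
similarity = np.einsum('bik,bjk->bij', hz_lstm, chz_lstm)

# softmax over the context axis for each essay step
attn = np.exp(similarity - similarity.max(axis=-1, keepdims=True))
attn /= attn.sum(axis=-1, keepdims=True)

# weighted sum of context states per essay step: (batch, essay_steps, lstm_units)
weighted_hz_lstm = np.einsum('bij,bjk->bik', attn, chz_lstm)
print(weighted_hz_lstm.shape)   # (1, 8, 100)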
Example #6
def build_hrcnn_model(opts,
                      vocab_size=0,
                      maxnum=50,
                      maxlen=50,
                      embedd_dim=50,
                      embedding_weights=None,
                      verbose=False,
                      init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen
    print(opts)
    logger.info(
        "Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, nbfilters = %s, filter1_len = %s, drop rate = %s"
        % (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')

    # embedding layer
    if opts.use_mask == 0:
        x = Embedding(output_dim=embedd_dim,
                      input_dim=vocab_size,
                      input_length=N * L,
                      weights=embedding_weights,
                      mask_zero=False,
                      name='x')(word_input)
        x_maskedout = x

    elif opts.use_mask == 1:
        x = Embedding(output_dim=embedd_dim,
                      input_dim=vocab_size,
                      input_length=N * L,
                      weights=embedding_weights,
                      mask_zero=True,
                      name='x')(word_input)
        x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)

    # drop out
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    # reshape
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)
    # CNN layer
    zcnn = TimeDistributed(Convolution1D(opts.nbfilters,
                                         opts.filter1_len,
                                         border_mode='valid'),
                           name='zcnn')(resh_W)

    # pooling mode1 on CNN
    if opts.mode1 == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(),
                                   name='avg_zcnn')(zcnn)
    elif opts.mode1 == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
    elif opts.mode1 == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    input_shape=(K.int_shape(zcnn)[2],
                                                 K.int_shape(zcnn)[3]),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = merge([avg_zcnn1, avg_zcnn2],
                         mode='concat',
                         name='avg_zcnn')
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                   name='hz_lstm')(avg_zcnn)

    # pooling mode1 on LSTM
    if opts.mode2 == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode2 == 'att':
        logger.info('Use attention-pooling on text')
        avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode2 == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = merge([avg_hz_lstm1, avg_hz_lstm2],
                            mode='concat',
                            name='avg_hz_lstm')
    else:
        raise NotImplementedError

    # l2 regularization
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(output_dim=1,
                  activation='sigmoid',
                  name='output',
                  kernel_regularizer=regularizers.l2(
                      opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(output_dim=1, activation='sigmoid',
                  name='output')(avg_hz_lstm)

    model = Model(input=word_input, output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        # plain attribute assignment would not update the weight tensor itself
        K.set_value(model.layers[-1].bias, bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
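
All of the builders above optionally initialise the output-layer bias to the logit of the mean training score, so the untrained network starts out predicting the dataset mean rather than 0.5. A short numeric check of that `log(p / (1 - p))` computation:

import numpy as np

init_mean_value = 0.62   # assumed mean of the min-max scaled training scores
bias = np.log(init_mean_value) - np.log(1 - init_mean_value)   # logit(p)
print(round(bias, 4))                          # 0.4895

# with zero output weights, sigmoid(bias) recovers the mean score
print(round(1.0 / (1.0 + np.exp(-bias)), 4))   # 0.62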