Example #1
def build_model(opts,
                overall_maxlen,
                vocab_size=0,
                embedd_dim=50,
                embedding_weights=None,
                verbose=True,
                init_mean_value=None):

    word_input = Input(shape=(overall_maxlen, ),
                       dtype='int32',
                       name='word_input')

    x = Embedding(output_dim=embedd_dim,
                  input_dim=vocab_size,
                  input_length=overall_maxlen,
                  weights=[embedding_weights],
                  mask_zero=True,
                  name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    position_embedding = Position_Embedding(name='position_embedding')(drop_x)
    self_att = MutilHeadAttention(nb_head=8, size_per_head=16,
                                  name='self_att')(position_embedding)

    lstm = LSTM(opts.rnn_dim, return_sequences=True, name='lstm')(self_att)
    avg_pooling = GlobalAveragePooling1D(name='avg_pooling')(lstm)

    y = Dense(output_dim=1,
              activation='sigmoid',
              name='y',
              W_regularizer=l2(opts.l2_value))(avg_pooling)

    model = Model(input=[word_input], output=y)

    if opts.init_bias and init_mean_value:

        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    optimize = optimizers.rmsprop(lr=0.001)

    model.compile(loss='mse', optimizer=optimize)

    return model
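
The `opts` namespace comes from the project's command-line parser, which is not shown here. A minimal invocation sketch for Example #1, assuming only the attributes the function body actually reads (`opts.dropout`, `opts.rnn_dim`, `opts.l2_value`, `opts.init_bias`) and placeholder values for the vocabulary and embedding matrix:

import numpy as np
from argparse import Namespace

# hypothetical hyper-parameters; the real values come from the CLI parser
opts = Namespace(dropout=0.5, rnn_dim=100, l2_value=0.001, init_bias=False)

vocab_size = 4000
overall_maxlen = 500
embedd_dim = 50

# random matrix standing in for pretrained word vectors; row 0 is the
# padding index that mask_zero=True relies on
embedding_weights = np.random.uniform(-0.05, 0.05, (vocab_size, embedd_dim))
embedding_weights[0] = 0.0

model = build_model(opts, overall_maxlen,
                    vocab_size=vocab_size,
                    embedd_dim=embedd_dim,
                    embedding_weights=embedding_weights)

# dummy batch of two padded essays -> two scores in [0, 1]
scores = model.predict(np.random.randint(1, vocab_size, (2, overall_maxlen)))
print(scores.shape)   # (2, 1)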
Example #2
def build_model(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50, embedding_weights=None, verbose=False, init_mean_value=None):
    N = maxnum
    L = maxlen

    p = Input(shape=(4, 2048), dtype='float32', name='p')
    # img_vector = Dense(name='img_vector', units=128)(p)

    word_input = Input(shape=(N * L,), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L, weights=embedding_weights,
                  mask_zero=True, trainable=False, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    cnn_e = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, border_mode='valid'), name='cnn_e')(resh_W)

    att_cnn_e = TimeDistributed(Attention(), name='att_cnn_e')(cnn_e)

    lstm_e = LSTM(opts.lstm_units, return_sequences=True, name='lstm_e')(att_cnn_e)

    G = CoAttention(name='essay')([lstm_e, p])
    avg = GlobalAveragePooling1D()(G)
    final_vec_drop = Dropout(rate=0.5, name='final_vec_drop')(avg)
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output', W_regularizer=l2(opts.l2_value))(final_vec_drop)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(final_vec_drop)

    model = Model(input=[word_input, p], output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='adam')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
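
Example #2 feeds the co-attention layer a precomputed image-feature tensor `p` of shape (4, 2048) alongside the essay representation. How those features are produced is not shown; 2048 matches the pooled output of a standard ImageNet backbone, so one plausible (purely hypothetical) preparation step is to stack pooled ResNet50 features for four prompt images:

import numpy as np
from keras.applications.resnet50 import ResNet50, preprocess_input

# frozen backbone whose pooled output is a 2048-d vector per image
extractor = ResNet50(include_top=False, pooling='avg')

def image_features(images):
    """images: float array (4, 224, 224, 3) -> features (4, 2048)."""
    return extractor.predict(preprocess_input(images.astype('float32')))

# one training sample: 4 prompt images -> p of shape (1, 4, 2048)
p_sample = image_features(np.random.rand(4, 224, 224, 3) * 255.0)[None, ...]
print(p_sample.shape)   # (1, 4, 2048)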
Example #3
def build_hrcnn_model(opts,
                      vocab_size=0,
                      char_vocabsize=0,
                      maxnum=50,
                      maxlen=50,
                      maxcharlen=20,
                      embedd_dim=50,
                      embedding_weights=None,
                      verbose=False,
                      init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen

    logger.info(
        "Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, nbfilters = %s, filter1_len = %s, drop rate = %s"
        % (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim,
                  input_dim=vocab_size,
                  input_length=N * L,
                  weights=embedding_weights,
                  mask_zero=True,
                  name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # add char-based CNN, concatenating with word embedding to compose word representation
    if opts.use_char:
        char_input = Input(shape=(N * L * maxcharlen, ),
                           dtype='int32',
                           name='char_input')
        xc = Embedding(output_dim=opts.char_embedd_dim,
                       input_dim=char_vocabsize,
                       input_length=N * L * maxcharlen,
                       mask_zero=True,
                       name='xc')(char_input)
        xc_masked = ZeroMaskedEntries(name='xc_masked')(xc)
        drop_xc = Dropout(opts.dropout, name='drop_xc')(xc_masked)
        res_xc = Reshape((N * L, maxcharlen, opts.char_embedd_dim),
                         name='res_xc')(drop_xc)
        cnn_xc = TimeDistributed(Conv1D(opts.char_nbfilters,
                                        opts.filter2_len,
                                        padding='valid'),
                                 name='cnn_xc')(res_xc)
        max_xc = TimeDistributed(GlobalMaxPooling1D(), name='max_xc')(cnn_xc)
        res_xc2 = Reshape((N, L, opts.char_nbfilters), name='res_xc2')(max_xc)

        # concat word-level and char-level features (Keras 2 functional API)
        w_repr = concatenate([resh_W, res_xc2], name='w_repr')
        zcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                      opts.filter1_len,
                                      padding='valid'),
                               name='zcnn')(w_repr)
    else:
        zcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                      opts.filter1_len,
                                      padding='valid'),
                               name='zcnn')(resh_W)

    # pooling mode
    if opts.mode == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(),
                                   name='avg_zcnn')(zcnn)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = concatenate([avg_zcnn1, avg_zcnn2], name='avg_zcnn')
    else:
        raise NotImplementedError
    hz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                   name='hz_lstm')(avg_zcnn)

    if opts.mode == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on text')
        avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = concatenate([avg_hz_lstm1, avg_hz_lstm2],
                                  name='avg_hz_lstm')
    else:
        raise NotImplementedError
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1,
                  activation='sigmoid',
                  name='output',
                  W_regularizer=l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(avg_hz_lstm)

    if opts.use_char:
        model = Model(inputs=[word_input, char_input], outputs=y)
    else:
        model = Model(inputs=word_input, outputs=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
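
Every example pairs `mask_zero=True` with a custom `ZeroMaskedEntries` layer before `Dropout`/`Reshape`, because layers such as `Reshape` and `Conv1D` do not support masking. The layer is defined elsewhere in the repository; a minimal sketch of what a layer like this typically does (zero the padded embedding rows and stop the mask from propagating), assuming Keras 2:

from keras import backend as K
from keras.engine.topology import Layer   # keras.layers.Layer in newer releases

class ZeroMaskedEntries(Layer):
    """Zero the embedding rows of masked (padding) timesteps and consume the
    mask so that downstream layers without mask support can be used."""

    def __init__(self, **kwargs):
        self.supports_masking = True
        super(ZeroMaskedEntries, self).__init__(**kwargs)

    def call(self, x, mask=None):
        if mask is None:
            return x
        mask = K.cast(mask, K.floatx())        # (batch, timesteps)
        return x * K.expand_dims(mask, -1)     # broadcast over the embedding dim

    def compute_mask(self, inputs, mask=None):
        return None                            # stop mask propagation here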
Example #4
def build_model_fusion(opts, vocab_size=0, maxnum=50, maxlen=50, embedd_dim=50, embedding_weights=None, verbose=False, init_mean_value=None):

    # p_input1 = Input(shape=(256, 256, 3), dtype='float32', name='p_input1')
    # p_input2 = Input(shape=(256, 256, 3), dtype='float32', name='p_input2')
    # p_input3 = Input(shape=(256, 256, 3), dtype='float32', name='p_input3')
    # p_input4 = Input(shape=(256, 256, 3), dtype='float32', name='p_input4')
    p = Input(shape=(256, 256, 3), dtype='float32', name='p')
    cnn_model = cnn()
    img = cnn_model(p)
    img = Reshape([6*6, 100])(img)
    # img1 = cnn_model(p_input1)
    # img2 = cnn_model(p_input2)
    # img3 = cnn_model(p_input3)
    # img4 = cnn_model(p_input4)
    # img1 = GlobalMaxPooling2D()(img1)
    # img2 = GlobalMaxPooling2D()(img2)
    # img3 = GlobalMaxPooling2D()(img3)
    # img4 = GlobalMaxPooling2D()(img4)

    # img = concatenate([img1, img2, img3, img4], axis=1)
    # img = Reshape((4, 100))(img)

    N = maxnum
    L = maxlen

    word_input = Input(shape=(N * L,), dtype='int32', name='word_input')
    x = Embedding(output_dim=embedd_dim, input_dim=vocab_size, input_length=N * L, weights=embedding_weights,
                  mask_zero=True, name='x')(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    cnn_e = TimeDistributed(Conv1D(opts.nbfilters, opts.filter1_len, border_mode='valid', activation='tanh'), name='cnn_e')(resh_W)
    cnn_e = Dropout(rate=0.5)(cnn_e)
    att_cnn_e = TimeDistributed(Attention(), name='att_cnn_e')(cnn_e)
    att_cnn_e = Dropout(rate=0.5)(att_cnn_e)
    lstm_e = LSTM(opts.lstm_units, return_sequences=True, name='lstm_e')(att_cnn_e)
    lstm_e = Dropout(rate=0.5)(lstm_e)
    G = CoAttention(name='essay')([lstm_e, img])
    avg = GlobalAveragePooling1D()(G)
    final_vec_drop = Dropout(rate=0.5, name='final_vec_drop')(avg)

    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1, activation='sigmoid', name='output', W_regularizer=l2(opts.l2_value))(final_vec_drop)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(final_vec_drop)

    # model = Model(input=[word_input, p_input1, p_input2, p_input3, p_input4], output=y)
    model = Model(input=[word_input, p], output=y)
    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) - np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='adam')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
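
Example #4 relies on a `cnn()` image encoder defined elsewhere; since its output is reshaped to `(6*6, 100)`, the encoder must map a 256x256x3 image to a 6x6x100 feature map. A hypothetical encoder with a compatible output shape (an assumption, not the original definition):

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D

def cnn():
    """Hypothetical image encoder: 256x256x3 -> 6x6x100 feature map,
    matching the Reshape([6*6, 100]) applied to its output in Example #4."""
    model = Sequential(name='img_cnn')
    model.add(Conv2D(32, 3, padding='same', activation='relu',
                     input_shape=(256, 256, 3)))
    model.add(MaxPooling2D(2))   # 128x128
    model.add(Conv2D(64, 3, padding='same', activation='relu'))
    model.add(MaxPooling2D(2))   # 64x64
    model.add(Conv2D(64, 3, padding='same', activation='relu'))
    model.add(MaxPooling2D(2))   # 32x32
    model.add(Conv2D(100, 3, padding='same', activation='relu'))
    model.add(MaxPooling2D(2))   # 16x16
    model.add(MaxPooling2D(2))   # 8x8
    model.add(Conv2D(100, 3, padding='valid', activation='relu'))  # 6x6x100
    return model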
Example #5
def build_shrcnn_model(opts,
                       vocab_size=0,
                       char_vocabsize=0,
                       maxnum=50,
                       maxlen=50,
                       maxcnum=50,
                       maxclen=50,
                       maxcharlen=20,
                       embedd_dim=50,
                       embedding_weights=None,
                       verbose=False,
                       init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen

    cN = maxcnum
    cL = maxclen

    logger.info(
        "Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, nbfilters = %s, filter1_len = %s, drop rate = %s"
        % (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')
    context_input = Input(shape=(cN * cL, ),
                          dtype='int32',
                          name='context_input')

    emb = Embedding(output_dim=embedd_dim,
                    input_dim=vocab_size,
                    weights=embedding_weights,
                    mask_zero=True,
                    name='cx')
    cx = emb(context_input)
    cx_maskedout = ZeroMaskedEntries(name='cx_maskedout')(cx)
    drop_cx = Dropout(opts.dropout, name='drop_cx')(cx_maskedout)

    resh_C = Reshape((cN, cL, embedd_dim), name='resh_C')(drop_cx)

    czcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                   opts.filter1_len,
                                   padding='valid'),
                            name='czcnn')(resh_C)

    x = emb(word_input)
    x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)

    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)

    # add char-based CNN, concatenating with word embedding to compose word representation
    zcnn = TimeDistributed(Conv1D(opts.nbfilters,
                                  opts.filter1_len,
                                  padding='valid'),
                           name='zcnn')(resh_W)
    '''
    encoded_essay = Reshape((zcnn.shape[1].value*zcnn.shape[2].value, opts.nbfilters))(zcnn)
    encoded_context = Reshape((czcnn.shape[1].value*czcnn.shape[2].value, opts.nbfilters))(czcnn)
    # bidaf
    # Now we compute a similarity between the passage words and the question words, and
    # normalize the matrix in a couple of different ways for input into some more layers.
    matrix_attention_layer = MatrixAttention(name='essay_context_similarity')
    # matrix_attention_layer = LinearMatrixAttention(name='passage_question_similarity')

    # Shape: (batch_size, num_passage_words, num_question_words)
    essay_context_similarity = matrix_attention_layer([encoded_essay, encoded_context])


    # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
    # words for each passage word.
    essay_context_attention = MaskedSoftmax()(essay_context_similarity)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="essay_context_vectors", use_masking=False)
    essay_context_vectors = weighted_sum_layer([encoded_context, essay_context_attention])

    
    # Min's paper finds, for each document word, the most similar question word to it, and
    # computes a single attention over the whole document using these max similarities.
    # Shape: (batch_size, num_passage_words)
    context_essay_similarity = Max(axis=-1)(essay_context_similarity)
    # Shape: (batch_size, num_passage_words)
    context_essay_attention = MaskedSoftmax()(context_essay_similarity)
    # Shape: (batch_size, embedding_dim * 2)
    weighted_sum_layer = WeightedSum(name="question_passage_vector", use_masking=False)
    context_essay_vector = weighted_sum_layer([encoded_essay, context_essay_attention])

    # Then he repeats this question/passage vector for every word in the passage, and uses it
    # as an additional input to the hidden layers above.
    repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
    # Shape: (batch_size, num_passage_words, embedding_dim * 2)
    tiled_context_essay_vector = repeat_layer([context_essay_vector, encoded_essay])

    complex_concat_layer = ComplexConcat(combination='1*2,1*3', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([encoded_essay,
                                                 essay_context_vectors,
                                                 tiled_context_essay_vector])
    

    complex_concat_layer = ComplexConcat(combination='1*2', name='final_merged_passage')
    final_merged_passage = complex_concat_layer([encoded_essay,
                                                 essay_context_vectors])


    mcnn = Reshape((zcnn.shape[1].value, zcnn.shape[2].value, opts.nbfilters), name='mcnn')(final_merged_passage)
    '''

    # pooling mode
    if opts.mode == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(),
                                   name='avg_zcnn')(zcnn)
        # pool the context CNN too; chz_lstm below needs avg_czcnn in every mode
        avg_czcnn = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_czcnn')(czcnn)
    elif opts.mode == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
        avg_czcnn = TimeDistributed(Attention(), name='avg_czcnn')(czcnn)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = concatenate([avg_zcnn1, avg_zcnn2], name='avg_zcnn')
        # pool the context CNN too; chz_lstm below needs avg_czcnn in every mode
        avg_czcnn = TimeDistributed(GlobalAveragePooling1D(),
                                    name='avg_czcnn')(czcnn)
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                   name='hz_lstm')(avg_zcnn)
    chz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                    name='chz_lstm')(avg_czcnn)

    if opts.mode == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'att':
        logger.info('Use co-attention on text')

        # PART 2:
        # Now we compute a similarity between the passage words and the question words, and
        # normalize the matrix in a couple of different ways for input into some more layers.
        matrix_attention_layer = MatrixAttention(
            name='essay_context_similarity')
        # Shape: (batch_size, num_passage_words, num_question_words)
        essay_context_similarity = matrix_attention_layer([hz_lstm, chz_lstm])

        # Shape: (batch_size, num_passage_words, num_question_words), normalized over question
        # words for each passage word.
        essay_context_attention = MaskedSoftmax()(essay_context_similarity)
        weighted_sum_layer = WeightedSum(name="essay_context_vectors",
                                         use_masking=False)
        # Shape: (batch_size, num_passage_words, embedding_dim * 2)
        weighted_hz_lstm = weighted_sum_layer(
            [chz_lstm, essay_context_attention])

        # Min's paper finds, for each document word, the most similar question word to it, and
        # computes a single attention over the whole document using these max similarities.
        # Shape: (batch_size, num_passage_words)
        context_essay_similarity = Max(axis=-1)(essay_context_similarity)
        # Shape: (batch_size, num_passage_words)
        context_essay_attention = MaskedSoftmax()(context_essay_similarity)
        # Shape: (batch_size, embedding_dim * 2)
        weighted_sum_layer = WeightedSum(name="context_essay_vector",
                                         use_masking=False)
        context_essay_vector = weighted_sum_layer(
            [hz_lstm, context_essay_attention])

        # Then he repeats this question/passage vector for every word in the passage, and uses it
        # as an additional input to the hidden layers above.
        repeat_layer = RepeatLike(axis=1, copy_from_axis=1)
        # Shape: (batch_size, num_passage_words, embedding_dim * 2)
        tiled_context_essay_vector = repeat_layer(
            [context_essay_vector, hz_lstm])

        complex_concat_layer = ComplexConcat(combination='1,2,1*2,1*3',
                                             name='final_merged_passage')
        final_merged_passage = complex_concat_layer(
            [hz_lstm, weighted_hz_lstm, tiled_context_essay_vector])

        avg_hz_lstm = LSTM(opts.lstm_units,
                           return_sequences=False,
                           name='avg_hz_lstm')(final_merged_passage)

        # avg_hz_lstm = CoAttentionWithoutBi(name='avg_hz_lstm')([hz_lstm, weighted_hz_lstm])

        # avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = concatenate([avg_hz_lstm1, avg_hz_lstm2],
                                  name='avg_hz_lstm')
    else:
        raise NotImplementedError
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(units=1,
                  activation='sigmoid',
                  name='output',
                  W_regularizer=l2(opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(units=1, activation='sigmoid', name='output')(avg_hz_lstm)

    model = Model(inputs=[word_input, context_input], outputs=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        model.layers[-1].b.set_value(bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
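
The `MatrixAttention`, `MaskedSoftmax` and `WeightedSum` layers used in the 'att' branch come from an external library and are not defined here. Ignoring masking and assuming a plain dot-product similarity (MatrixAttention's similarity function is configurable), the arithmetic of that branch's essay-to-context attention reduces to this NumPy sketch:

import numpy as np

hz_lstm = np.random.randn(1, 8, 100)    # (batch, essay_steps, lstm_units)
chz_lstm = np.random.randn(1, 6, 100)   # (batch, context_steps, lstm_units)

# dot-product similarity: (batch, essay_steps, context_steps)
similarity = np.einsum('bik,bjk->bij', hz_lstm, chz_lstm)

# softmax over the context axis for each essay step
attn = np.exp(similarity - similarity.max(axis=-1, keepdims=True))
attn /= attn.sum(axis=-1, keepdims=True)

# weighted sum of context states per essay step: (batch, essay_steps, lstm_units)
weighted_hz_lstm = np.einsum('bij,bjk->bik', attn, chz_lstm)
print(weighted_hz_lstm.shape)   # (1, 8, 100)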
Example #6
def build_hrcnn_model(opts,
                      vocab_size=0,
                      maxnum=50,
                      maxlen=50,
                      embedd_dim=50,
                      embedding_weights=None,
                      verbose=False,
                      init_mean_value=None):
    # LSTM stacked over CNN based on sentence level
    N = maxnum
    L = maxlen
    print(opts)
    logger.info(
        "Model parameters: max_sentnum = %d, max_sentlen = %d, embedding dim = %s, nbfilters = %s, filter1_len = %s, drop rate = %s"
        % (N, L, embedd_dim, opts.nbfilters, opts.filter1_len, opts.dropout))

    word_input = Input(shape=(N * L, ), dtype='int32', name='word_input')

    # embedding layer
    if opts.use_mask == 0:
        x = Embedding(output_dim=embedd_dim,
                      input_dim=vocab_size,
                      input_length=N * L,
                      weights=embedding_weights,
                      mask_zero=False,
                      name='x')(word_input)
        x_maskedout = x

    elif opts.use_mask == 1:
        x = Embedding(output_dim=embedd_dim,
                      input_dim=vocab_size,
                      input_length=N * L,
                      weights=embedding_weights,
                      mask_zero=True,
                      name='x')(word_input)
        x_maskedout = ZeroMaskedEntries(name='x_maskedout')(x)

    # drop out
    drop_x = Dropout(opts.dropout, name='drop_x')(x_maskedout)
    # reshape
    resh_W = Reshape((N, L, embedd_dim), name='resh_W')(drop_x)
    # CNN layer
    zcnn = TimeDistributed(Convolution1D(opts.nbfilters,
                                         opts.filter1_len,
                                         border_mode='valid'),
                           name='zcnn')(resh_W)

    # pooling mode1 on CNN
    if opts.mode1 == 'mot':
        logger.info("Use mean-over-time pooling on sentence")
        avg_zcnn = TimeDistributed(GlobalAveragePooling1D(),
                                   name='avg_zcnn')(zcnn)
    elif opts.mode1 == 'att':
        logger.info('Use attention-pooling on sentence')
        avg_zcnn = TimeDistributed(Attention(), name='avg_zcnn')(zcnn)
    elif opts.mode1 == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on sentence')
        avg_zcnn1 = TimeDistributed(GlobalAveragePooling1D(),
                                    input_shape=(K.int_shape(zcnn)[2],
                                                 K.int_shape(zcnn)[3]),
                                    name='avg_zcnn1')(zcnn)
        avg_zcnn2 = TimeDistributed(Attention(), name='avg_zcnn2')(zcnn)
        avg_zcnn = merge([avg_zcnn1, avg_zcnn2],
                         mode='concat',
                         name='avg_zcnn')
    else:
        raise NotImplementedError

    hz_lstm = LSTM(opts.lstm_units, return_sequences=True,
                   name='hz_lstm')(avg_zcnn)

    # pooling mode1 on LSTM
    if opts.mode2 == 'mot':
        logger.info('Use mean-over-time pooling on text')
        avg_hz_lstm = GlobalAveragePooling1D(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode2 == 'att':
        logger.info('Use attention-pooling on text')
        avg_hz_lstm = Attention(name='avg_hz_lstm')(hz_lstm)
    elif opts.mode2 == 'merged':
        logger.info(
            'Use mean-over-time and attention-pooling together on text')
        avg_hz_lstm1 = GlobalAveragePooling1D(name='avg_hz_lstm1')(hz_lstm)
        avg_hz_lstm2 = Attention(name='avg_hz_lstm2')(hz_lstm)
        avg_hz_lstm = merge([avg_hz_lstm1, avg_hz_lstm2],
                            mode='concat',
                            name='avg_hz_lstm')
    else:
        raise NotImplementedError

    # l2 regularization
    if opts.l2_value:
        logger.info("Use l2 regularizers, l2 value = %s" % opts.l2_value)
        y = Dense(output_dim=1,
                  activation='sigmoid',
                  name='output',
                  kernel_regularizer=regularizers.l2(
                      opts.l2_value))(avg_hz_lstm)
    else:
        y = Dense(output_dim=1, activation='sigmoid',
                  name='output')(avg_hz_lstm)

    model = Model(input=word_input, output=y)

    if opts.init_bias and init_mean_value:
        logger.info("Initialise output layer bias with log(y_mean/1-y_mean)")
        bias_value = (np.log(init_mean_value) -
                      np.log(1 - init_mean_value)).astype(K.floatx())
        # plain attribute assignment would not update the weight tensor itself
        K.set_value(model.layers[-1].bias, bias_value)

    if verbose:
        model.summary()

    start_time = time.time()
    model.compile(loss='mse', optimizer='rmsprop')
    total_time = time.time() - start_time
    logger.info("Model compiled in %.4f s" % total_time)

    return model
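
All of the builders above optionally initialise the output-layer bias to the logit of the mean training score, so the untrained network starts out predicting the dataset mean rather than 0.5. A short numeric check of that `log(p / (1 - p))` computation:

import numpy as np

init_mean_value = 0.62   # assumed mean of the min-max scaled training scores
bias = np.log(init_mean_value) - np.log(1 - init_mean_value)   # logit(p)
print(round(bias, 4))                          # 0.4895

# with zero output weights, sigmoid(bias) recovers the mean score
print(round(1.0 / (1.0 + np.exp(-bias)), 4))   # 0.62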