Example #1
	def __init__(self, output_dim, hidden_dim, output_length, depth=1,bidirectional=True, dropout=0.25, **kwargs):
		if bidirectional and hidden_dim % 2 != 0:
			raise Exception("hidden_dim for AttentionSeq2seq should be even (because of the bidirectional RNN).")
		super(AttentionSeq2seq, self).__init__()
		if type(depth) not in [list, tuple]:
			depth = (depth, depth)
		if bidirectional:
			encoder = Bidirectional(LSTMEncoder(output_dim=hidden_dim // 2, state_input=False, return_sequences=True, **kwargs))
		else:
			encoder = LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs)
		decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length, state_input=False, **kwargs)
		lstms = []
		for i in range(1, depth[0]):
			if bidirectional:
				layer = Bidirectional(LSTMEncoder(output_dim=hidden_dim // 2, state_input=False, return_sequences=True, **kwargs))
			else:
				layer = LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs)
			self.add(layer)
			lstms.append(layer)
			self.add(Dropout(dropout))
		self.add(encoder)
		self.add(Dropout(dropout))
		self.add(TimeDistributedDense(hidden_dim if depth[1] > 1 else output_dim))
		lstms.append(encoder)
		self.add(decoder)
		lstms = [decoder]
		for i in range(1, depth[1]):
			layer = LSTMEncoder(output_dim=hidden_dim, state_input=False, return_sequences=True, **kwargs)
			self.add(layer)
			lstms.append(layer)
			self.add(Dropout(dropout))
		if depth[1] > 1:
			self.add(TimeDistributedDense(output_dim))
		self.encoder = encoder
		self.decoder = decoder
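Below is a minimal usage sketch for the AttentionSeq2seq container above, assuming the old seq2seq add-on library for Keras 0.x/1.x; input_dim and input_length are assumed to be forwarded to the encoder through **kwargs, and every dimension here is illustrative.

# hypothetical usage of the AttentionSeq2seq container defined above
model = AttentionSeq2seq(input_dim=5, input_length=7, hidden_dim=10,
                         output_length=8, output_dim=20, depth=1)
model.compile(loss='mse', optimizer='rmsprop')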
Example #2
def buildLSTMModel(layerCount, input_dim, added_dim):
    model = Sequential()
    model.add(TimeDistributedDense(input_dim=input_dim, output_dim=input_dim + added_dim))
    for lcount in range(layerCount):
        model.add(LSTM(input_dim=input_dim + added_dim, output_dim=input_dim + added_dim, return_sequences=True))
    model.add(TimeDistributedDense(input_dim=input_dim + added_dim, output_dim=input_dim))
    return model
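A minimal usage sketch for buildLSTMModel above, assuming the old Keras 0.x/1.x Sequential API (TimeDistributedDense, nb_epoch); all shapes and hyperparameters are illustrative.

import numpy as np

# illustrative data: 32 sequences, 10 timesteps, 8 features each
X = np.random.random((32, 10, 8)).astype('float32')
y = np.random.random((32, 10, 8)).astype('float32')

model = buildLSTMModel(layerCount=2, input_dim=8, added_dim=4)
model.compile(loss='mse', optimizer='rmsprop')
model.fit(X, y, batch_size=16, nb_epoch=2, verbose=0)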
Example #3
def getModel(LSTM_HIDDEN_STATES=300,
             FIRST_DROPOUT=0.0,
             SECOND_DROPOUT=0.0,
             DENSE_LAYERS_SIZE=300,
             seqLength=64,
             word_data_dim=100,
             char_data_dim=43,
             optimizer='rmsprop',
             lr=0.001):

    nb_classes = 2
    nb_filters = 10

    decoder = Graph()

    decoder.add_input(name='input1',
                      input_shape=(seqLength, word_data_dim),
                      dtype='float')
    decoder.add_input(name='input2',
                      input_shape=(seqLength, char_data_dim),
                      dtype='float')

    decoder.add_node(Dropout(0.0),
                     name='mergedInput',
                     inputs=['input1', 'input2'])
    #decoder.add_node(Masking(mask_value=0.,), input = 'mergedInput', name='maskedInput')
    decoder.add_node(LSTM(LSTM_HIDDEN_STATES, return_sequences=True),
                     input='mergedInput',
                     name='LSTMForward')
    decoder.add_node(LSTM(LSTM_HIDDEN_STATES,
                          return_sequences=True,
                          go_backwards=True),
                     input='mergedInput',
                     name='LSTMBackward')
    decoder.add_node(Dropout(FIRST_DROPOUT),
                     name='firstDropout',
                     inputs=['LSTMForward', 'LSTMBackward'])
    decoder.add_node(TimeDistributedDense(DENSE_LAYERS_SIZE,
                                          activation='relu'),
                     name='tdd1',
                     input='firstDropout')
    decoder.add_node(Dropout(SECOND_DROPOUT),
                     name='secondDropout',
                     input='tdd1')
    decoder.add_node(TimeDistributedDense(nb_classes, activation='softmax'),
                     input='secondDropout',
                     name='tdd2')
    decoder.add_output(name='output', input='tdd2')

    if optimizer == 'rmsprop':
        optimizer = RMSprop(lr)
    elif optimizer == 'sgd':
        optimizer = SGD(lr)

    decoder.compile(optimizer, {'output': 'categorical_crossentropy'},
                    metrics=['accuracy'])

    return decoder
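A minimal usage sketch for getModel above, assuming the legacy Keras Graph API in which fit takes a dict keyed by input and output names; the batch size and random data are illustrative only.

import numpy as np

# illustrative batch of 16 samples matching the default seqLength / feature sizes
X_word = np.random.random((16, 64, 100)).astype('float32')
X_char = np.random.random((16, 64, 43)).astype('float32')
labels = np.random.randint(0, 2, size=(16, 64))
Y = np.eye(2)[labels].astype('float32')  # one-hot targets per timestep

decoder = getModel()
decoder.fit({'input1': X_word, 'input2': X_char, 'output': Y},
            batch_size=8, nb_epoch=1, verbose=0)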
Example #4
def get_outputs(X_batch):
    l2_reg = 0.01
    # numTime, weights_dir and memories are assumed to come from the enclosing scope
    stim_shape = (numTime, 40, 50, 50)
    RMSmod = RMSprop(lr=0.001, rho=0.99, epsilon=1e-6)
    num_filters = (8, 16)
    filter_size = (13, 13)
    weight_init = 'he_normal'
    batchsize = 100
    model = Sequential()
    # first convolutional layer
    model.add(TimeDistributedConvolution2D(num_filters[0], filter_size[0], filter_size[1],
                                 input_shape=stim_shape,
                                 border_mode='same', subsample=(1,1),
                                 W_regularizer=l2(l2_reg)))

    #Add relu activation separately for threshold visualizations
    model.add(Activation('relu'))
    # max pooling layer
    model.add(TimeDistributedMaxPooling2D(pool_size=(2, 2), ignore_border=True))

    # flatten
    model.add(TimeDistributedFlatten())

    # Add dense (affine) layer with relu activation
    model.add(TimeDistributedDense(num_filters[1], W_regularizer=l2(l2_reg), activation='relu'))
    # Add LSTM, forget gate bias automatically initialized to 1, default weight initializations recommended
    model.add(LSTM(100*num_filters[1], return_sequences=True))

    # # Add a final dense (affine) layer with softplus activation
    model.add(TimeDistributedDense(1, init=weight_init, W_regularizer=l2(l2_reg), activation='softplus'))
    model.compile(loss='poisson_loss', optimizer=RMSmod)
    model.load_weights(weights_dir)
    if not memories:
        get_outputs = theano.function([model.layers[0].input], model.layers[5].get_output(train=False))
        outputs = get_outputs(X_batch)
    else:
        model2 = Sequential()
        model2.add(TimeDistributedConvolution2D(num_filters[0], filter_size[0], filter_size[1],
                                 input_shape=stim_shape, weights=model.layers[0].get_weights(),
                                 border_mode='same', subsample=(1,1),
                                 W_regularizer=l2(l2_reg)))

        #Add relu activation separately for threshold visualizations
        model2.add(Activation('relu'))
        # max pooling layer
        model2.add(TimeDistributedMaxPooling2D(pool_size=(2, 2), ignore_border=True))

        # flatten
        model2.add(TimeDistributedFlatten())

        # Add dense (affine) layer with relu activation
        model2.add(TimeDistributedDense(num_filters[1], weights=model.layers[4].get_weights(), W_regularizer=l2(l2_reg), activation='relu'))
        # Add LSTM, forget gate bias automatically initialized to 1, default weight initializations recommended
        model2.add(LSTMMem(100*num_filters[1], weights=model.layers[5].get_weights(), return_memories=True))
        model2.compile(loss='poisson_loss', optimizer=RMSmod)
        get_outputs = theano.function([model2.layers[0].input], model2.layers[5].get_output(train=False))
        outputs = get_outputs(X_batch)
    return outputs
Example #5
    def __init__(self, input_dim, input_length, output_dim, init='glorot_uniform', inner_init='orthogonal', forget_bias_init='one', activation='tanh', inner_activation='hard_sigmoid',
                 weights=None, truncate_gradient=-1,
                  hidden_state=None, batch_size=None, depth=1, remember_state=False,
                 inner_return_sequences=True, return_sequences=True):

        if not weights:
            weights = [None] * 5  # no weights for the merge layer
        if not hidden_state:
            hidden_state = [None]*6
        super(StatefulContainer, self).__init__()

        forward = DeepLSTM(input_dim=input_dim*2, output_dim=output_dim,
                        input_length=input_length,
                        weights=weights[2], hidden_state=hidden_state[2],
                        batch_size=batch_size,depth=depth, remember_state=remember_state,
                        inner_return_sequences=inner_return_sequences,
                        return_sequences=return_sequences, init='glorot_uniform', 
                        inner_init='orthogonal', forget_bias_init='one',
                        activation='tanh', inner_activation='hard_sigmoid')

        reverse = DeepLSTM(input_dim=input_dim*2, output_dim=output_dim,
                        input_length=input_length,
                        weights=weights[3], hidden_state=hidden_state[3],
                        batch_size=batch_size,depth=depth, remember_state=remember_state,
                        inner_return_sequences=inner_return_sequences,
                        return_sequences=return_sequences, init='glorot_uniform', 
                        inner_init='orthogonal', forget_bias_init='one',
                        activation='tanh', inner_activation='hard_sigmoid', go_backwards=True)

        #A common input to both forward and reverse LSTMs
        #This layer learns a direction invariant representation of your input data
        self.add(TimeDistributedDense(input_dim=input_dim, output_dim=input_dim*2, 
                                input_length=input_length))

        if weights[0]:
            self.layers[0].set_weights(weights[0])

        self.add(Dropout(0.7))
        self.add(forward)
        self.add(reverse)
        reverse.set_previous(forward.layers[0].previous)#Woah!
        merge = Merge([forward, reverse], mode='concat', concat_axis=-1)
        layers = self.layers[:2]
        for l in layers:
            params, regs, consts, updates = l.get_params()
            merge.regularizers += regs
            merge.updates += updates
            for p, c in zip(params, consts):
                if p not in merge.params:
                    merge.params.append(p)
                    merge.constraints.append(c)
        self.add(merge)
        if return_sequences:
            self.add(TimeDistributedDense(output_dim))
        else:
            self.add(Dense(output_dim))
Example #6
def create_model(vocab_size, args):
    if args.rnn == 'GRU':
        RNN = recurrent.GRU
    elif args.rnn == 'LSTM':
        RNN = recurrent.LSTM
    else:
        assert False, "Invalid RNN"

    if args.bidirectional:
        model = Graph()
        model.add_input(name="input",
                        batch_input_shape=(args.batch_size, 1),
                        dtype="uint")
        model.add_node(Embedding(vocab_size, args.embed_size, mask_zero=True),
                       name="embed",
                       input='input')
        for i in xrange(args.layers):
            model.add_node(
                RNN(args.hidden_size, return_sequences=True),
                name='forward' + str(i + 1),
                input='embed' if i == 0 else 'dropout' +
                str(i) if args.dropout > 0 else None,
                inputs=['forward' + str(i), 'backward' +
                        str(i)] if i > 0 and args.dropout == 0 else [])
            model.add_node(
                RNN(args.hidden_size, return_sequences=True,
                    go_backwards=True),
                name='backward' + str(i + 1),
                input='embed' if i == 0 else 'dropout' +
                str(i) if args.dropout > 0 else None,
                inputs=['forward' + str(i), 'backward' +
                        str(i)] if i > 0 and args.dropout == 0 else [])
            if args.dropout > 0:
                model.add_node(
                    Dropout(args.dropout),
                    name='dropout' + str(i + 1),
                    inputs=['forward' + str(i + 1), 'backward' + str(i + 1)])
        model.add_node(
            TimeDistributedDense(vocab_size, activation="softmax"),
            name="softmax",
            input='dropout' + str(args.layers) if args.dropout > 0 else None,
            inputs=[
                'forward' + str(args.layers), 'backward' + str(args.layers)
            ] if args.dropout == 0 else [])
        model.add_output(name='output', input="softmax")
    else:
        model = Sequential()
        model.add(Embedding(vocab_size, args.embed_size, mask_zero=True))
        for i in xrange(args.layers):
            model.add(RNN(args.hidden_size, return_sequences=True))
            if args.dropout > 0:
                model.add(Dropout(args.dropout))
        model.add(TimeDistributedDense(vocab_size, activation="softmax"))

    return model
Example #7
def build_model(args):
    np.random.seed(args.seed)

    graph = Graph()

    graph.add_input('input', input_shape=(args.input_width, ), dtype='int')

    graph.add_node(build_embedding_layer(args),
                   input='input',
                   name='embedding')

    graph.add_node(LSTM(args.n_units,
                        truncate_gradient=args.truncate_gradient,
                        return_sequences=True),
                   input='embedding',
                   name='lstm0')

    graph.add_node(LSTM(args.n_units,
                        truncate_gradient=args.truncate_gradient,
                        return_sequences=True),
                   input='lstm0',
                   name='lstm1')

    # Attention module.
    graph.add_node(TimeDistributedDense(args.n_units, activation='relu'),
                   input='lstm1',
                   name='attention0')
    graph.add_node(TimeDistributedDense(args.n_units, activation='relu'),
                   input='attention0',
                   name='attention1')
    graph.add_node(TimeDistributedDense(args.n_units, activation='softmax'),
                   input='attention1',
                   name='attention2')

    # Apply mask from output of attention module to LSTM output.
    graph.add_node(TimeDistributedMerge(mode='sum'),
                   inputs=['lstm1', 'attention2'],
                   name='applyattn',
                   merge_mode='mul')

    graph.add_node(Dense(args.n_classes, activation='softmax'),
                   input='applyattn',
                   name='softmax')

    graph.add_output(input='softmax', name='output')

    load_weights(args, graph)

    optimizer = build_optimizer(args)

    graph.compile(loss={'output': args.loss}, optimizer=optimizer)

    return graph
Example #8
def build_model(glove, vocab, module_prep_model, c):
    s0pad = s1pad = c['spad']
    max_sentences = c['max_sentences']
    rnn_dim = 1
    print('Model')
    model = Graph()
    # ===================== inputs of size (batch_size, max_sentences, s_pad)
    model.add_input('si03d', (max_sentences, s0pad),
                    dtype=int)  # XXX: cannot be cast to int->problem?
    model.add_input('si13d', (max_sentences, s1pad), dtype=int)
    if True:  # TODO: if flags
        model.add_input('f04d', (max_sentences, s0pad, nlp.flagsdim))
        model.add_input('f14d', (max_sentences, s1pad, nlp.flagsdim))
        model.add_node(Reshape_((s0pad, nlp.flagsdim)), 'f0', input='f04d')
        model.add_node(Reshape_((s1pad, nlp.flagsdim)), 'f1', input='f14d')

    # ===================== reshape to (batch_size * max_sentences, s_pad)
    model.add_node(Reshape_((s0pad, )), 'si0', input='si03d')
    model.add_node(Reshape_((s1pad, )), 'si1', input='si13d')

    # ===================== outputs from sts
    _prep_model(model, glove, vocab, module_prep_model, c, c['oact'], s0pad,
                s1pad, rnn_dim)  # out = ['scoreS1', 'scoreS2']
    # ===================== reshape (batch_size * max_sentences,) -> (batch_size, max_sentences, rnn_dim)
    model.add_node(Reshape_((max_sentences, rnn_dim)),
                   'sts_in1',
                   input='scoreS1')
    model.add_node(Reshape_((max_sentences, rnn_dim)),
                   'sts_in2',
                   input='scoreS2')

    # ===================== [w_full_dim, q_full_dim] -> [class, rel]
    model.add_node(TimeDistributedDense(1,
                                        activation='sigmoid',
                                        W_regularizer=l2(c['l2reg']),
                                        b_regularizer=l2(c['l2reg'])),
                   'c',
                   input='sts_in1')
    model.add_node(TimeDistributedDense(1,
                                        activation='sigmoid',
                                        W_regularizer=l2(c['l2reg']),
                                        b_regularizer=l2(c['l2reg'])),
                   'r',
                   input='sts_in2')

    model.add_node(SumMask(), 'mask', input='si03d')
    # ===================== mean of class over rel
    model.add_node(WeightedMean(max_sentences=max_sentences),
                   name='weighted_mean',
                   inputs=['c', 'r', 'mask'])
    model.add_output(name='score', input='weighted_mean')
    return model
Example #9
def build(max_len, embedding_dim, word2id_size, skipgram_offsets, pos2id_size, pdtbmark2id_size, pdtbpair2id_size, pdtbpair_offsets):

    model = Graph()
    loss = {}

    # input: word ids with masked post-padding (doc, time_pad)
    model.add_input(name='x_word_pad', input_shape=(None,), dtype='int')

    # input: word ids with random post-padding (doc, time_pad)
    model.add_input(name='x_word_rand', input_shape=(None,), dtype='int')

    # shared 1: word embedding layer (doc, time_pad, emb)
    model.add_node(Embedding(word2id_size, embedding_dim, input_length=max_len, init='glorot_uniform'), name='shared_1', input='x_word_pad')
    #XXX: mask_zero=True

    # shared 2: bidirectional GRU full sequence layer (doc, time_pad, repr)
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid', inner_activation='sigmoid', init='he_uniform', inner_init='orthogonal'), name='shared_2_fwd', input='shared_1')
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid', inner_activation='sigmoid', init='he_uniform', inner_init='orthogonal', go_backwards=True), name='shared_2_bck', input='shared_1')
    model.add_node(TimeDistributedDense(embedding_dim, init='he_uniform'), name='shared_2', inputs=['shared_1', 'shared_2_fwd', 'shared_2_bck'], merge_mode='concat')

    # shared 3: bidirectional GRU full sequence layer (doc, time_pad, repr)
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid', inner_activation='sigmoid', init='he_uniform', inner_init='orthogonal'), name='shared_3_fwd', input='shared_2')
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid', inner_activation='sigmoid', init='he_uniform', inner_init='orthogonal', go_backwards=True), name='shared_3_bck', input='shared_2')
    model.add_node(TimeDistributedDense(embedding_dim, init='he_uniform'), name='shared_3', inputs=['shared_2', 'shared_3_fwd', 'shared_3_bck'], merge_mode='concat')

    # shared 4: bidirectional GRU full sequence layer (doc, time_pad, repr)
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid', inner_activation='sigmoid', init='he_uniform', inner_init='orthogonal'), name='shared_4_fwd', input='shared_3')
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid', inner_activation='sigmoid', init='he_uniform', inner_init='orthogonal', go_backwards=True), name='shared_4_bck', input='shared_3')
    model.add_node(TimeDistributedDense(embedding_dim, init='he_uniform'), name='shared_4', inputs=['shared_3', 'shared_4_fwd', 'shared_4_bck'], merge_mode='concat')

    # skip-gram model: skip-gram labels (doc, time_pad, offset)
    skipgram_out = skipgram_model(model, ['shared_1', 'x_word_rand'], max_len, embedding_dim, word2id_size, skipgram_offsets)
    model.add_output(name='y_skipgram', input=skipgram_out)
    loss['y_skipgram'] = 'mse'

    # POS model: POS tags (doc, time_pad, pos2id)
    pos_out = pos_model(model, ['shared_2'], max_len, embedding_dim, pos2id_size)
    model.add_output(name='y_pos', input=pos_out)
    loss['y_pos'] = 'binary_crossentropy'

    # PDTB marking model: discourse relation boundary markers (doc, time, offset, pdtbmark2id)
    pdtbmark_out = pdtbmark_model(model, ['shared_3'], max_len, embedding_dim, pdtbmark2id_size)
    model.add_output(name='y_pdtbmark', input=pdtbmark_out)
    loss['y_pdtbmark'] = 'binary_crossentropy'

    # PDTB pairs model: discourse relation span-pair occurrences (doc, time, offset, pdtbpair2id)
    pdtbpair_out = pdtbpair_model(model, ['shared_4'], max_len, embedding_dim, pdtbpair2id_size, pdtbpair_offsets)
    model.add_output(name='y_pdtbpair', input=pdtbpair_out)
    loss['y_pdtbpair'] = 'binary_crossentropy'

    model.compile(optimizer='rmsprop', loss=loss)
    return model
Example #10
    def LSTMModel(self, nHidden=150, lr=0.01):
        # print('nHidden: %i\tlr: %.3f' % (nHidden, lr))
        self.rnnModel.add(
            GRU(nHidden,
                activation='sigmoid',
                input_shape=(None, self.maxFeatures),
                return_sequences=True))
        # self.rnnModel.add(LSTM(nHidden, activation='sigmoid', input_shape=(None, nHidden), return_sequences=True))
        self.rnnModel.add(TimeDistributedDense(nHidden))
        self.rnnModel.add(Activation('relu'))
        self.rnnModel.add(TimeDistributedDense(self.maxFeatures))
        self.rnnModel.add(Activation('softmax'))
        rmsprop = RMSprop(lr=lr, rho=0.9, epsilon=1e-06)
        self.rnnModel.compile(loss='categorical_crossentropy',
                              optimizer=rmsprop)
Example #11
    def test_seq_to_seq(self):
        print('sequence to sequence data:')
        (X_train, y_train), (X_test,
                             y_test) = get_test_data(nb_train=1000,
                                                     nb_test=200,
                                                     input_shape=(3, 5),
                                                     output_shape=(3, 5),
                                                     classification=False)
        print('X_train:', X_train.shape)
        print('X_test:', X_test.shape)
        print('y_train:', y_train.shape)
        print('y_test:', y_test.shape)

        model = Sequential()
        model.add(
            TimeDistributedDense(y_train.shape[-1],
                                 input_shape=(X_train.shape[1],
                                              X_train.shape[2])))
        model.compile(loss='hinge', optimizer='rmsprop')
        history = model.fit(X_train,
                            y_train,
                            nb_epoch=12,
                            batch_size=16,
                            validation_data=(X_test, y_test),
                            verbose=0)
        self.assertTrue(history.history['val_loss'][-1] < 0.8)
Example #12
def get_RNN_model(in_shape,
                  td_num=512,
                  lstm_out_dim=256,
                  nb_hidden=100,
                  drop1=0.5,
                  drop2=0.5):
    model = Sequential()

    model.add(GaussianNoise(0.05, input_shape=in_shape))
    model.add(TimeDistributedDense(td_num))
    model.add(LSTM(lstm_out_dim, return_sequences=True))
    reg = l2(0.05)
    #    model.add(TimeDistributedDense(td_num, W_regularizer=l2(0.03)))
    #reg.set_param(model.layers[3].get_params()[0][0])
    #model.layers[3].regularizers = [reg]
    model.add(Dropout(drop1))

    model.add(LSTM(lstm_out_dim))
    #  reg = l2(0.05)
    #  reg.set_param(model.layers[3].get_params()[0][0])
    #  model.layers[3].regularizers = [reg]
    model.add(Dropout(drop1))
    #    model.regularizers = [l2(0.05)]
    #model.add(Activation('relu'))

    model.add(Flatten())
    model.add(Dense(nb_hidden, W_regularizer=l2(0.05)))
    model.add(Activation('relu'))
    model.add(Dropout(drop2))

    model.add(Dense(1))
    model.add(Activation('linear'))

    model.compile(loss='mse', optimizer='rmsprop')
    return model
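A minimal usage sketch for get_RNN_model above (the function compiles the model itself), assuming old-Keras fit arguments; shapes are illustrative.

import numpy as np

# illustrative data: 64 sequences of 20 timesteps with 30 features, one scalar target each
X = np.random.random((64, 20, 30)).astype('float32')
y = np.random.random((64, 1)).astype('float32')

model = get_RNN_model(in_shape=(20, 30))
model.fit(X, y, batch_size=16, nb_epoch=2, verbose=0)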
Example #13
def create_neural_network(freq_dimensions, hidden_dimensions, rec_units=1):
    model = Sequential()
    model.add(
        TimeDistributedDense(input_dim=freq_dimensions,
                             output_dim=hidden_dimensions))
    for i in range(rec_units):
        model.add(
            LSTM(input_dim=hidden_dimensions,
                 output_dim=hidden_dimensions,
                 return_sequences=True))

    model.add(
        TimeDistributedDense(input_dim=hidden_dimensions,
                             output_dim=freq_dimensions))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')
    return model
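A minimal usage sketch for create_neural_network above, assuming spectrogram-like input where the model maps a frame sequence back onto itself; all sizes are illustrative.

import numpy as np

# illustrative data: 8 clips, 100 timesteps, 513 frequency bins
X = np.random.random((8, 100, 513)).astype('float32')

model = create_neural_network(freq_dimensions=513, hidden_dimensions=256, rec_units=2)
model.fit(X, X, batch_size=4, nb_epoch=1, verbose=0)  # e.g. sequence autoencoding
preds = model.predict(X)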
Example #14
def build_model():
    print('Build model...')
    graph = Graph()
    graph.add_input(name='input', ndim=3)
    graph.add_node(GRU(len_circ_repr, 128, return_sequences=True),
                   name='gru1',
                   input='input')
    graph.add_node(TimeDistributedDense(128, 128), name='tdd', input='gru1')
    graph.add_node(GRU(128, 128, return_sequences=False),
                   name='gru2',
                   input='tdd')
    graph.add_node(Dense(128, 120, activation='softmax'),
                   name='seconds',
                   input='gru2')
    graph.add_node(Dense(128, 120, activation='softmax'),
                   name='minutes',
                   input='gru2')
    graph.add_output(name='out1', input='seconds')
    graph.add_output(name='out2', input='minutes')

    print('Compile model...')
    graph.compile('rmsprop', {
        'out1': 'categorical_crossentropy',
        'out2': 'categorical_crossentropy'
    })
    return graph
Example #15
def creat_binary_tag_LSTM(sourcevocabsize, targetvocabsize, source_W, input_seq_lenth, output_seq_lenth,
                          hidden_dim, emd_dim, loss='categorical_crossentropy', optimizer='rmsprop'):
    encoder_a = Sequential()
    encoder_b = Sequential()
    encoder_c = Sequential()
    l_A_embedding = Embedding(input_dim=sourcevocabsize+1,
                        output_dim=emd_dim,
                        input_length=input_seq_lenth,
                        mask_zero=True,
                        weights=[source_W])
    encoder_a.add(l_A_embedding)
    encoder_a.add(Dropout(0.3))
    encoder_b.add(l_A_embedding)
    encoder_b.add(Dropout(0.3))
    encoder_c.add(l_A_embedding)

    Model = Sequential()

    encoder_a.add(LSTM(hidden_dim, return_sequences=True))
    encoder_b.add(LSTM(hidden_dim, return_sequences=True, go_backwards=True))
    encoder_rb = Sequential()
    encoder_rb.add(ReverseLayer2(encoder_b))
    encoder_ab = Merge([encoder_a, encoder_rb], mode='concat')
    Model.add(encoder_ab)

    decodelayer = LSTMDecoder_tag(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                  input_length=input_seq_lenth,
                                  output_length=output_seq_lenth,
                                  state_input=False,
                                  return_sequences=True)
    Model.add(decodelayer)
    Model.add(TimeDistributedDense(targetvocabsize+1))
    Model.add(Activation('softmax'))
    Model.compile(loss=loss, optimizer=optimizer)
    return Model
Example #16
def prep_model(model, N, s0pad, s1pad, c):
    winputs = ['e0', 'e1']
    if c['wproject']:
        model.add_shared_node(name='wproj', inputs=winputs, outputs=['e0w', 'e1w'],
                              layer=TimeDistributedDense(output_dim=int(N*c['wdim']),
                                                         activation=c['wact']))
        winputs = ['e0w', 'e1w']

    model.add_shared_node(name='bow', inputs=winputs, outputs=['e0b', 'e1b'],
                          layer=TimeDistributedMerge(mode='ave'))
    bow_last = ('e0b', 'e1b')

    for i in range(c['deep']):
        bow_next = ('e0b[%d]'%(i,), 'e1b[%d]'%(i,))
        model.add_shared_node(name='deep[%d]'%(i,), inputs=bow_last, outputs=bow_next,
                              layer=Dense(output_dim=N, init=c['nninit'],
                                          activation=c['nnact'],
                                          W_regularizer=l2(c['l2reg'])))
        bow_last = bow_next

    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=bow_last, outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=N, output_dim=int(N*c['pdim']),
                                          activation=c['pact'],
                                          W_regularizer=l2(c['l2reg'])))
        return ('e0p', 'e1p')
    else:
        return bow_last
Example #17
def VGG19_hieratt(query_in_size, query_embed_size, nb_classes):
    """Stack hierarchical attention on pre-trained VGG19.
    Requires https://github.com/fchollet/deep-learning-models"""

    base_model = VGG19(weights='imagenet')
    input_image = base_model.input
    input_question = Input(shape=(query_in_size, ))  # question vector

    # Model up to 3rd block
    f_1 = Model(input=input_image,
                output=base_model.get_layer('block3_pool').output)
    f_1 = f_1(input_image)
    f_1 = Reshape((256, 28 * 28))(f_1)
    f_1 = Permute((2, 1))(f_1)

    q_1 = Dense(query_embed_size,
                activation='relu')(input_question)  # Encode question
    # Add question embedding to each feature column
    q_1 = RepeatVector(28 * 28)(q_1)
    q_f = merge([f_1, q_1], 'concat')
    # Estimate and apply attention per feature
    att_1 = TimeDistributedDense(1, activation="sigmoid")(q_f)
    att_1 = Lambda(repeat_1, output_shape=(28 * 28, 256))(att_1)
    att_1 = merge([f_1, att_1], 'mul')
    # Reshape to the original feature map from previous layer
    att_1 = Permute((2, 1))(att_1)
    f_1_att = Reshape((256, 28, 28))(att_1)

    model = Model(input=[input_image, input_question], output=f_1_att)
    print model.summary()
    return model
Example #18
def make_dense(X, y, num_layers, width, dropout):
    assert len(X.shape) == 2
    assert len(y.shape) == 2

    vocab_size = np.amax(X) + 1

    print 'Vocab size:', vocab_size

    m = Sequential()
    m.add(Embedding(vocab_size, 8))
    m.add(Dropout(dropout))

    m.add(TimeDistributedDense(8, 64))
    m.add(Flatten())

    m.add(BatchNormalization((64 * X.shape[1],)))
    m.add(PReLU((64 * X.shape[1],)))
    m.add(Dropout(dropout))
    m.add(Dense(64 * X.shape[1], width))

    for i in range(num_layers):
        m.add(BatchNormalization((width,)))
        m.add(PReLU((width,)))
        m.add(Dropout(dropout))
        m.add(Dense(width, width))

    m.add(BatchNormalization((width,)))
    m.add(PReLU((width,)))
    m.add(Dropout(dropout))
    m.add(Dense(width, y.shape[1]))

    m.add(Activation('softmax'))
    return m, 1
Example #19
def rel_types_model(model,
                    ins,
                    max_len,
                    embedding_dim,
                    rel_types2id_size,
                    focus,
                    pre='rtypes'):
    """Discourse relation types model as Keras Graph."""

    # prepare focus dimensionality
    model.add_node(RepeatVector(rel_types2id_size),
                   name=pre + '_focus_rep',
                   input=focus)
    model.add_node(Permute((2, 1)),
                   name=pre + '_focus',
                   input=pre + '_focus_rep')

    # discourse relation types dense neural network (sample, time_pad, rel_types2id)
    model.add_node(TimeDistributedDense(rel_types2id_size, init='he_uniform'),
                   name=pre + '_dense',
                   input=ins[0])
    model.add_node(Activation('softmax'),
                   name=pre + '_softmax',
                   input=pre + '_dense')

    # multiplication to focus the activations (doc, time_pad, rel_types2id)
    model.add_node(Activation('linear'),
                   name=pre + '_out',
                   inputs=[pre + '_focus', pre + '_softmax'],
                   merge_mode='mul')
    return pre + '_out'
Example #20
def build_net():
    ng = NumberGenerator(lambda x, y: x + y)

    # parameters
    n_epochs = 10
    training_size = 50000

    rnn = recurrent.LSTM
    hidden_size = 128
    batch_size = 128
    layers = 1

    print('Building model...')
    model = Sequential()
    # NOTE: the first recurrent layer also needs an input_shape matching the data
    # produced by NumberGenerator; return_sequences=True so the stacked RNN below
    # receives a 3D sequence.
    model.add(rnn(hidden_size, return_sequences=True))

    for _ in range(layers):
        model.add(rnn(hidden_size, return_sequences=True))

    model.add(TimeDistributedDense(2))
    model.add(Activation('softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    for epoch, epoch_data in enumerate(
            ng.generate_data(n_epochs, size=training_size)):
        print('\n' + '-' * 50 + '\nIteration %d' % epoch)
        model.fit(epoch_data[0],
                  epoch_data[1],
                  batch_size=batch_size,
                  show_accuracy=True)
Example #21
def get_enc2dec(RNN, HIDDEN_SIZE=128, LAYERS=1, DIM=100, MAXLEN=100):
    """
	Enc-Dec Model
	see Vinyals et. al. 2014 http://arxiv.org/pdf/1412.7449v1.pdf
	"""
    model = Graph()

    model.add_input(name='input', input_shape=(None, DIM))
    model.add_node(RNN(HIDDEN_SIZE, return_sequences=True),
                   name='e_r0',
                   input='input')

    prev_node = 'e_r0'
    for layer in xrange(LAYERS - 1):
        model.add_node(RNN(HIDDEN_SIZE, return_sequences=True),
                       name='e_r' + str(layer + 1),
                       input=prev_node)
        prev_node = 'e_r' + str(layer + 1)

    model.add_node(RNN(HIDDEN_SIZE), name='e_final', input=prev_node)
    model.add_node(RepeatVector(MAXLEN), name='encoder', input='e_final')

    prev_node = 'encoder'
    for layer in xrange(LAYERS - 1):
        model.add_node(RNN(HIDDEN_SIZE, return_sequences=True),
                       name='d_r' + str(layer + 1),
                       input=prev_node)
        prev_node = 'd_r' + str(layer + 1)

    model.add_node(TimeDistributedDense(MAXLEN), name='d_tdd', input=prev_node)
    model.add_node(Activation('softmax'), name='softmax', input='d_tdd')
    model.add_output(name='output', input='softmax')

    return model
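A minimal usage sketch for get_enc2dec above, assuming the legacy Graph.compile(optimizer, loss_dict) signature; the RNN class and dimensions are illustrative.

from keras.layers import recurrent

# illustrative: a 2-layer LSTM encoder-decoder over 50-dim feature vectors
model = get_enc2dec(recurrent.LSTM, HIDDEN_SIZE=64, LAYERS=2, DIM=50, MAXLEN=20)
model.compile(optimizer='rmsprop', loss={'output': 'categorical_crossentropy'})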
Example #22
def build_RNN_model(vocab_size, embedding_dims, rnn_layer_dim, num_classes):
    """Build the RNN model"""
    model = Sequential()  # Sequential model
    # Embedding layer
    model.add(Embedding(vocab_size, embedding_dims))
    # Recurrent layer
    model.add(
        SimpleRNN(int(rnn_layer_dim),
                  init='glorot_uniform',
                  inner_init='orthogonal',
                  activation='tanh',
                  W_regularizer=None,
                  U_regularizer=None,
                  b_regularizer=None,
                  dropout_W=0.0,
                  dropout_U=0.0,
                  return_sequences=True,
                  stateful=False))
    # Time distributed dense layer (activation is softmax, since it is a classification problem)
    model.add(
        TimeDistributedDense(num_classes,
                             init='glorot_uniform',
                             activation='softmax'))

    return model
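A minimal usage sketch for build_RNN_model above (the function does not compile the model); the vocabulary, class count and random data are illustrative.

import numpy as np

# illustrative token data: 32 sequences of 50 word ids, 5 target classes per timestep
X = np.random.randint(1, 1000, size=(32, 50))
labels = np.random.randint(0, 5, size=(32, 50))
Y = np.eye(5)[labels].astype('float32')

model = build_RNN_model(vocab_size=1000, embedding_dims=64, rnn_layer_dim=128, num_classes=5)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.fit(X, Y, batch_size=16, nb_epoch=2, verbose=0)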
Example #23
  def build_model(self, params):
    hidden_layers = params['hidden_layers']
    input_dim = params['feat_size']
    output_dim = params['phone_vocab_size']
    drop_prob = params['drop_prob_encoder']
    self.nLayers = len(hidden_layers)

    # First Layer is an encoder layer
    
    self.model.add(TimeDistributedDense(hidden_layers[0], init='glorot_uniform', input_dim=input_dim))
    self.model.add(Dropout(drop_prob))
    
    # Second Layer is the Recurrent Layer 
    if params.get('recurrent_type','simple') == 'simple':
        self.model.add(SimpleRNN(hidden_layers[1], init='glorot_uniform', inner_init='orthogonal',
            activation='sigmoid', weights=None, truncate_gradient=-1, return_sequences=False, 
            input_dim=hidden_layers[0], input_length=None))
    elif params.get('recurrent_type','simple') == 'lstm':
        self.model.add(LSTM(hidden_layers[1], init='glorot_uniform', inner_init='orthogonal',
            input_dim=hidden_layers[0], input_length=None))

    # Then we add dense projection layer to map the RNN outputs to Vocab size 
    self.model.add(Dropout(drop_prob))
    self.model.add(Dense(output_dim, input_dim=hidden_layers[1], init='uniform'))
    self.model.add(Activation('softmax'))
  
    self.solver = getSolver(params)
    self.model.compile(loss='categorical_crossentropy', optimizer=self.solver)
    #score = model.evaluate(test_x)
    self.f_train = self.model.train_on_batch

    return self.f_train
Example #24
def test_sequence_to_sequence():
    '''
    Apply a same Dense layer for each element of time dimension of the input
    and make predictions of the output sequence elements.
    This does not make use of the temporal structure of the sequence
    (see TimeDistributedDense for more details)
    '''
    np.random.seed(1337)
    (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=500,
                                                         nb_test=200,
                                                         input_shape=(3, 5),
                                                         output_shape=(3, 5),
                                                         classification=False)

    model = Sequential()
    model.add(
        TimeDistributedDense(y_train.shape[-1],
                             input_shape=(X_train.shape[1], X_train.shape[2])))
    model.compile(loss='hinge', optimizer='rmsprop')
    history = model.fit(X_train,
                        y_train,
                        nb_epoch=20,
                        batch_size=16,
                        validation_data=(X_test, y_test),
                        verbose=0)
    assert (history.history['val_loss'][-1] < 0.8)
Example #25
def train_model(dataset, h0_dim, h1_dim, out_dim):
    X_train, y_train, X_test, y_test = dataset
    batch_size = 128
    nb_epoch = 100

    model = Sequential()
    model.add(
        RNN(h0_dim,
            input_shape=(None, X_train.shape[-1]),
            return_sequences=True))
    model.add(TimeDistributedDense(out_dim))
    model.add(Activation("linear"))
    model.compile(loss="mse", optimizer="rmsprop")
    #model.get_config(verbose=1)
    #yaml_string = model.to_yaml()
    #with open('ifshort_mlp.yaml', 'w') as f:
    #    f.write(yaml_string)

    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    checkpointer = ModelCheckpoint(filepath="/tmp/ifshort_rnn_weights.hdf5",
                                   verbose=1,
                                   save_best_only=True)
    model.fit(X_train,
              y_train,
              batch_size=batch_size,
              nb_epoch=nb_epoch,
              show_accuracy=False,
              verbose=2,
              validation_data=(X_test, y_test),
              callbacks=[early_stopping, checkpointer])
Example #26
def train_rnn(character_corpus, seq_len, train_test_split_ratio):
    model = Sequential()
    model.add(Embedding(character_corpus.char_num(), 256))
    model.add(LSTM(256, 5120, activation='sigmoid', inner_activation='hard_sigmoid', return_sequences=True))
    model.add(Dropout(0.5))
    model.add(TimeDistributedDense(5120, character_corpus.char_num()))
    model.add(Activation('time_distributed_softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    seq_X, seq_Y = character_corpus.make_sequences(seq_len)

    print "Sequences are made"

    train_seq_num = int(train_test_split_ratio * seq_X.shape[0])
    X_train = seq_X[:train_seq_num]
    Y_train = to_time_distributed_categorical(seq_Y[:train_seq_num], character_corpus.char_num())

    X_test = seq_X[train_seq_num:]
    Y_test = to_time_distributed_categorical(seq_Y[train_seq_num:], character_corpus.char_num())

    print "Begin train model"
    checkpointer = ModelCheckpoint(filepath="model.step", verbose=1, save_best_only=True)
    model.fit(X_train, Y_train, batch_size=256, nb_epoch=100, verbose=2, validation_data=(X_test, Y_test), callbacks=[checkpointer])

    print "Model is trained"

    score = model.evaluate(X_test, Y_test, batch_size=512)

    print "valid score = ", score

    return model
Example #27
 def __init__(self,
              output_dim,
              hidden_dim,
              output_length,
              depth=1,
              dropout=0.25,
              **kwargs):
     super(SimpleSeq2seq, self).__init__()
     if type(depth) not in [list, tuple]:
         depth = (depth, depth)
     self.encoder = LSTM(hidden_dim, **kwargs)
     self.decoder = LSTM(hidden_dim if depth[1] > 1 else output_dim,
                         return_sequences=True,
                         **kwargs)
     for i in range(1, depth[0]):
         self.add(LSTM(hidden_dim, return_sequences=True, **kwargs))
         self.add(Dropout(dropout))
     self.add(self.encoder)
     self.add(Dropout(dropout))
     self.add(RepeatVector(output_length))
     self.add(self.decoder)
     for i in range(1, depth[1]):
         self.add(LSTM(hidden_dim, return_sequences=True, **kwargs))
         self.add(Dropout(dropout))
     if depth[1] > 1:
         self.add(TimeDistributedDense(output_dim))
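A minimal usage sketch for the SimpleSeq2seq container above, in the style the seq2seq add-on library for old Keras was typically driven; all dimensions are illustrative.

# hypothetical usage of the SimpleSeq2seq container defined above
model = SimpleSeq2seq(input_dim=5, hidden_dim=10, output_length=8, output_dim=20)
model.compile(loss='mse', optimizer='rmsprop')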
Example #28
    def train_seq2seq(self):
        print "Input sequence read, starting training"
        #X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
        #Y_train = sequence.pad_sequences(self.Y_train, maxlen=self.maxlen)
        #X_val = sequence.pad_sequences(self.X_val, maxlen=self.maxlen)
        #y_val = sequence.pad_sequences(self.Y_val, maxlen=self.maxlen)
        #X_test = sequence.pad_sequences(self.X_test, maxlen=self.maxlen)
        #Y_test = sequence.pad_sequences(self.Y_test, maxlen=self.maxlen)

        model = Sequential()
        model.add(
            Embedding(len(self.proproces.vocab_hind),
                      30,
                      input_length=self.maxlen))
        model.add(RNN(30))  #, input_shape=(100, 128)))
        model.add(RepeatVector(self.maxlen))
        model.add(RNN(30, return_sequences=True))
        model.add(TimeDistributedDense(len(self.proproces.vocab_en)))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        for e in range(100):
            print("epoch %d" % e)
            for (X, Y) in self.proproces.gen_seq(
                    "../indian-parallel-corpora/hi-en/tok/dev.hi-en.en.0",
                    "../indian-parallel-corpora/hi-en/tok/dev.hi-en.hi", 128):
                loss, acc = model.train_on_batch(
                    X, Y)  #, batch_size=64, nb_epoch=1)
                print("Loss is %f, accuracy is %f " % (loss, acc))
            # After one epoch test one sentence
            if e % 5 == 0:
                print("Enter sentence in hindi")
                inp = raw_input().decode("utf-8")
                tokens = inp.split()
                seq = []
                for token in tokens:
                    if token in self.proproces.vocab_hind:
                        seq.append(self.proproces.vocab_hind[token])
                    else:
                        token = "UNK"
                        seq.append(self.proproces.vocab_hind[token])
                #seq = map(lambda x:self.proproces.vocab_hind[x], tokens)
                # Normalize seq to maxlen
                X = []
                X.append(seq)
                print X
                temp = sequence.pad_sequences(X, maxlen=self.maxlen)
                #temp[0:len(seq)] = seq
                print len(temp)
                #temp = np.asarray(temp).reshape(128,)
                print temp.shape
                prob = model.predict_on_batch(
                    temp)  #, batch_size=1, verbose=0)
                translated = self.decode(prob)
                print("Tranlated is", translated)
                print("Probabilities are", prob)
                print("Shape of prob tensor is", prob.shape)
Example #29
    def __init__(self, positive_weight, _num_of_hidden_units):
        super(LSTM_CNN_EEG, self).__init__()
        self.positive_weight = positive_weight
        self._num_of_hidden_units = _num_of_hidden_units
        '''
        define the neural network model:

        '''
        # from keras.layers.extra import *
        import numpy as np
        from keras.models import Sequential, model_from_json
        from keras.regularizers import l2
        from keras.datasets import mnist
        # from keras.initializations import normal, identity
        from keras.layers.recurrent import SimpleRNN, LSTM, GRU
        from keras.optimizers import RMSprop, Adadelta
        from keras.layers.convolutional import Convolution2D, Convolution1D, MaxPooling1D, MaxPooling2D
        from keras.layers.core import Dense, Activation, TimeDistributedDense, Dropout, Reshape, Flatten, Permute
        # from keras.layers.wrappers import TimeDistributed

        size = 28
        maxToAdd = 200
        # define our time-distributed setup
        model = Sequential()

        model.add(TimeDistributedDense(10, input_shape=(maxToAdd, 55)))
        # model.add(Convolution2D(1, 1, 10, border_mode='valid', input_shape=(1,maxToAdd, 55)))
        model.add(Activation('tanh'))
        model.add(Reshape(
            (1, maxToAdd, 10)))  # this line updated to work with keras 1.0.2
        model.add(Convolution2D(3, 20, 1, border_mode='valid'))  # org
        model.add(Activation('tanh'))
        model.add(Convolution2D(1, 1, 1, border_mode='same'))  # org
        model.add(Activation('tanh'))
        model.add(MaxPooling2D(pool_size=(20, 1), border_mode='valid'))
        model.add(Permute((2, 1, 3)))
        model.add(Reshape(
            (9, 10)))  # this line updated to work with keras 1.0.2
        model.add(GRU(output_dim=20, return_sequences=False))
        #
        model.add(Dense(2, activation='softmax'))

        model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
        self.model = model

        # model.predict(np.random.rand(28, 200, 55).astype(np.float32)).shape

        print model.layers[-1].output_shape
        # print "2 {} {}".format(model.layers[1].output_shape[-3:], (1, maxToAdd, np.prod(model.layers[1].output_shape[-3:])))
        self.original_weights = self.model.get_weights()
        """ :type Sequential"""
Example #30
    def train_seq2seq(self):
        print "Input sequence read, starting training"
        #X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
        #Y_train = sequence.pad_sequences(self.Y_train, maxlen=self.maxlen)
        #X_val = sequence.pad_sequences(self.X_val, maxlen=self.maxlen)
        #y_val = sequence.pad_sequences(self.Y_val, maxlen=self.maxlen)
        #X_test = sequence.pad_sequences(self.X_test, maxlen=self.maxlen)
        #Y_test = sequence.pad_sequences(self.Y_test, maxlen=self.maxlen)

        model = Sequential()
        #model.add(Embedding(len(self.proproces.vocab_hind), 100,
        #                    input_length=self.maxlen))
        model.add(
            RNN(80, input_shape=(self.maxlen, len(self.proproces.vocab_hind))))
        model.add(RepeatVector(self.maxlen))
        model.add(RNN(80, return_sequences=True))
        model.add(TimeDistributedDense(len(self.proproces.vocab_en)))
        model.add(Activation('softmax'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
        for e in range(1, 2000):
            print("epoch %d" % e)
            for (X, Y) in self.proproces.gen_seq(
                    "../indian-parallel-corpora/hi-en/tok/dev.hi-en.en.0",
                    "../indian-parallel-corpora/hi-en/tok/dev.hi-en.hi", 64):
                loss, acc = model.train_on_batch(
                    X, Y)  #, batch_size=64, nb_epoch=1)
                print("Loss is %f, accuracy is %f " % (loss, acc))
            # After one epoch test one sentence
            if e % 10 == 0:
                print("Enter sentence in hindi")
                inp = raw_input().decode("utf-8")
                tokens = inp.split()
                seq = []
                for token in tokens:
                    if token in self.proproces.vocab_hind:
                        seq.append(self.proproces.vocab_hind[token])
                    else:
                        token = "UNK"
                        seq.append(self.proproces.vocab_hind[token])
                #seq = map(lambda x:self.proproces.vocab_hind[x], tokens)
                # Normalize seq to maxlen
                X = []
                x = []
                temp = [0] * (self.maxlen)
                temp[0:len(seq)] = seq
                for ind in temp:
                    t = [0] * len(self.proproces.vocab_hind)
                    t[ind] = 1
                    x.append(t)
                X.append(x)
                X = np.asarray(X)
                print len(X)
                prob = model.predict(X)
                self.decode(prob)
                print("Probabilities are", prob)
Example #31
def test_lstm():
    
    # load wiki data
    X_train_np, X_valid_np, X_test_np = gen_data_wiki()
    batchsize = 100
    blocklength = 25000 #450000
    bsize_test = batchsize 
    numframe = 100
    numframe_test = 1250#2500#5000 
    X_valid = onehot(X_valid_np).reshape(bsize_test, X_valid_np.shape[0]/bsize_test, 205)
    X_test = onehot(X_test_np).reshape(bsize_test, X_test_np.shape[0]/bsize_test, 205)
    nb_classes= 205 

    X_train_shared = theano.shared(np.zeros((batchsize,blocklength, nb_classes)).astype('float32'), name = 'train_set', borrow=True)
    X_valid_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'), name = 'valid_set', borrow=True)
    X_test_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'), name = 'test_set', borrow=True)

    # build the model
    from keras.layers.recurrent import LSTM, SimpleRNN, LSTMgrave 
    from layer_icml import LSTM_bu, LSTM_td, RNN_td, RNN_bu, RNN_sh, RNN_dp, LSTM_dp, RNN_shallow 
    from layer_icml import RNN_relugate, RNN_ens, RNN_2tanh, RNN_ntanh, RNN_multidp, LSTM_multi, LSTM_u, RNN_utanh, LSTM_uu, LSTM_uugrave 
    from keras.layers.core import Dense, Activation, TimeDistributedDense
    from keras.initializations import normal, identity

    x = T.tensor3()
    y = T.matrix()

    name_init = 'uniform'
    n_h = 2450; L1 = LSTMgrave(output_dim = n_h, init = 'uniform', batchsize = batchsize, inner_init = 'uniform',input_shape = (None, nb_classes), return_sequences=True); name_model= 'lstm_shallowgrave_' + str(n_h) + name_init + '0.01'+ '_batchsize' + str(batchsize) + '_numframe' + str(numframe)

    # RNN
    name_act = 'tanh'; name_init = 'uniform' 
    #n_h=2048;L1 = RNN_shallow(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_tanh" + str(n_h) + "_"+name_act+ name_init + '0.1'
    #n_h = 2048;L1 = SimpleRNN(output_dim = n_h, init = 'uniform', inner_init = 'uniform', activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_shallow"+str(n_h)+name_act+ name_init + '0.05'
    #n_h = 4096;L1 = RNN_utanh(output_dim = n_h, init = 'uniform', U_init = name_init, activation = name_act, input_shape = (None, nb_classes), return_sequences=True);name_model = "rnn_utanh_2_0_0" + str(n_h) + "_"+name_act+ name_init +'0.01' 
    n_h = 2048; in_act = 'tanh';L1 = LSTM_uugrave(output_dim = n_h, batchsize = batchsize, init = 'uniform', inner_init = 'uniform', input_shape = (None, nb_classes), return_sequences=True); name_model= 'lstm_u_grave'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01' + '_batchsize' + str(batchsize) + '_numframe' + str(numframe)
    #n_h = 1200; in_act = 'tanh';L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform', input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_stack2'+in_act+'_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01'
    #n_h = 700; L2 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L3 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L4 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L5 = LSTM_uu(output_dim = n_h, init = 'uniform', inner_init = 'uniform',input_shape = (None, n_h), return_sequences=True); name_model= '7005layerlstm_uu_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'

    D1 = TimeDistributedDense(nb_classes);D1._input_shape = [None, None, n_h]
    O = Activation('softmax')

    #layers = [L1, L2, L3, L4, L5, D1, O]
    layers = [L1, D1, O]
    #layers = [L1, L2, D1, O]

    load_model = True 
    if load_model:
        #f_model = open('/data/lisatmp3/zhangsa/lstm/models/180rnn_td_reluidentityotherinit_identity_sgd0.1_clip10.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune5e-4inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune1e-5inorder_withtest.pkl', 'rb')
        layers = pickle.load(f_model)
        f_model.close()
        name_model_load = 'wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest' + 'finetune2e-6'
        #name_perpmat_load = 'wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.npy'
        L1 = layers[0]

    out  = x
    params = []
    for l in layers: 
        if not load_model:
            l.build()
        l.input = out
        params += l.params
        if l == L1:
            out = l.get_output()[0]
            h0 = l.get_output()[0]
            c0 = l.get_output()[1]
        else:
            out = l.get_output()

    # compute the loss
    loss = -T.mean(T.log(out)[:,:numframe-1,:] *x[:,1:,:])
    logperp_valid = T.mean(-T.log2(T.sum(out[:,:numframe_test-1,:]*x[:,1:,:],axis=2)))
    logperp_train = T.mean(-T.log2(T.sum(out[:,:numframe-1,:]*x[:,1:,:],axis=2)))

    # set optimizer
    from keras.constraints import identity as ident 
    from keras.optimizers import RMSprop, SGD, Adam

    lr_ = 2*1e-6
    clipnorm_ = 10000
    rmsprop = RMSprop(lr=lr_, clipnorm=clipnorm_)
    sgd = SGD(lr=lr_, momentum=0.9, clipnorm=clipnorm_)
    adam = Adam(lr=lr_)

    #opt = sgd; name_opt = 'sgd'+str(lr_); clip_flag = False 
    #opt = rmsprop; name_opt = 'rmsprop'+str(lr_)
    opt = adam; name_opt = 'adam' + str(lr_); clip_flag = False

    if clip_flag: 
        name_opt = name_opt + '_clip'+str(clipnorm_)

    #param update for regular parameters
    constraints = [ident() for p in params]    
    updates = opt.get_updates(params, constraints, loss)

    index = T.iscalar()
    f_train = theano.function([index], [loss, h0, c0], updates = updates,
            givens={x:X_train_shared[:,index*numframe : (index+1)*numframe, :]})

    # perplexity function
    f_perp_valid = theano.function([], [logperp_valid, h0, c0], givens={x:X_valid_shared})
    f_perp_test = theano.function([], [logperp_valid, h0, c0], givens={x:X_test_shared})

    #f_perp_valid = theano.function([index], [logperp_valid], givens={x:X_valid_shared[index*bsize_test : (index+1)*bsize_test]})
    #f_perp_test = theano.function([index], [logperp_valid], givens={x:X_test_shared[index*bsize_test : (index+1)*bsize_test]})


    def perp_valid():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_valid.shape[1]/numframe_test):
            X_valid_shared.set_value(X_valid[:, k*numframe_test:(k+1)*numframe_test, :])
            perp, h0, c0 = f_perp_valid()
            logperp_acc += perp
            L1.H0.set_value(h0[:,-1,:])
            L1.C0.set_value(c0[:,-1,:])
            n += 1
        return (logperp_acc/n)

    def perp_test():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_test.shape[1]/numframe_test):
            X_test_shared.set_value(X_test[:, k*numframe_test:(k+1)*numframe_test, :])
            perp, h0, c0 = f_perp_test()
            logperp_acc += perp
            L1.H0.set_value(h0[:,-1,:])
            L1.C0.set_value(c0[:,-1,:])
            n += 1
        return (logperp_acc/n)


    #def perp_valid():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_valid_np.shape[0]/(bsize_test*numframe_test)):
    #        X_valid_shared.set_value(onehot(X_valid_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_valid_shared.get_value().shape[0]/bsize_test):
    #            logperp_acc += f_perp_valid(i)
    #            n += 1
    #    return (logperp_acc/n)

    #def perp_test():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_test_np.shape[0]/(bsize_test*numframe_test)):
    #        X_test_shared.set_value(onehot(X_test_np[k*bsize_test*numframe_test:(k+1)*bsize_test*numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_test_shared.get_value().shape[0]/bsize_test):
    #            logperp_acc += f_perp_test(i)
    #            n += 1
    #    return (logperp_acc/n)


    ######## testmodel ########
    #test_score = perp_valid()
    #pdb.set_trace()


    epoch_ = 9000 
    perpmat = np.zeros((epoch_, 3))
    t_start = time.time()
    name = 'wiki100'+ name_model + '_' +  name_opt 

    if load_model:
        name = name_model_load 
        #perpmat = np.load(name_perpmat_load)

    #only_block = False
    #if only_block:
    #    name = name + 'random_only_block'
    #else:
    #    name = name + 'random_per_row_in_block'
    name = name+'inorder'
    blocksize = batchsize*blocklength
    bestscore = 100000000
    for epoch in xrange(epoch_):
        for k in xrange(X_train_np.shape[0]/blocksize):
            t_s = time.time()
            print "reloading " + str(k) + " th train patch..."

            #if only_block:
            #    pos = np.random.randint(0, X_train_np.shape[0]-blocksize)
            #    X_train_shared.set_value(onehot(X_train_np[pos: pos + blocksize]).reshape(batchsize, blocklength, 205))
            #else:    
            #    pos = np.random.randint(0, X_train_np.shape[0]-blocklength, batchsize)
            #    tmp = np.zeros((batchsize, blocklength, 205)).astype('float32')
            #    for j in xrange(batchsize):
            #        tmp[j] = onehot(X_train_np[pos[j]: pos[j] + blocklength])
            #    X_train_shared.set_value(tmp)
            X_train_shared.set_value(onehot(X_train_np[k*blocksize: (k+1)*blocksize]).reshape(batchsize, blocklength, 205)) 
            print "reloading finished, time cost: " + str(time.time()-t_s)
            L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            for i in xrange(blocklength/numframe):
                loss, h0, c0 = f_train(i)
                L1.H0.set_value(h0[:,-1,:])
                L1.C0.set_value(c0[:,-1,:])
                if i%10 == 0:
                    t_end = time.time()
                    print "Time consumed: " + str(t_end - t_start) + " secs."
                    t_start = time.time()
                    print "Epoch "+ str(epoch)+" " + name + ": The training loss in batch " + str(k*(blocklength/numframe)+i) +" is: " + str(loss) + "."
            if k%6 == 0:
                #save results
                m = epoch*X_train_np.shape[0]/(blocksize*6) +k/6
                perpmat[m][0], perpmat[m][1] = 0, perp_valid()
                perpmat[m][2] = perp_test()
                np.save('/data/lisatmp4/zhangsa/rnn_trans/results/' + name +'_withtest.npy', perpmat)

                #save model
                if perpmat[m][1] < bestscore:
                    bestscore = perpmat[m][1]
                    f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/' + name + '_withtest.pkl', 'wb+')
                    pickle.dump(layers, f_model)
                    f_model.close()
       
        print "Epoch "+ str(epoch)+ " " + name + ": The training perp is: " + str(perpmat[epoch][0]) \
                      + ", test perp is: " + str(perpmat[epoch][1]) + "."