def __init__(self, output_dim, hidden_dim, output_length, depth=1,
             bidirectional=True, dropout=0.25, **kwargs):
    if bidirectional and hidden_dim % 2 != 0:
        raise Exception("hidden_dim for AttentionSeq2seq should be even (because of the bidirectional RNN).")
    super(AttentionSeq2seq, self).__init__()
    if type(depth) not in [list, tuple]:
        depth = (depth, depth)
    if bidirectional:
        # integer division: hidden_dim is guaranteed even by the check above
        encoder = Bidirectional(LSTMEncoder(output_dim=hidden_dim // 2, state_input=False,
                                            return_sequences=True, **kwargs))
    else:
        encoder = LSTMEncoder(output_dim=hidden_dim, state_input=False,
                              return_sequences=True, **kwargs)
    decoder = AttentionDecoder(hidden_dim=hidden_dim, output_length=output_length,
                               state_input=False, **kwargs)
    # depth[0] - 1 extra encoder-side layers below the final encoder
    lstms = []
    for i in range(1, depth[0]):
        if bidirectional:
            layer = Bidirectional(LSTMEncoder(output_dim=hidden_dim // 2, state_input=False,
                                              return_sequences=True, **kwargs))
        else:
            layer = LSTMEncoder(output_dim=hidden_dim, state_input=False,
                                return_sequences=True, **kwargs)
        self.add(layer)
        lstms.append(layer)
        self.add(Dropout(dropout))
    self.add(encoder)
    self.add(Dropout(dropout))
    self.add(TimeDistributedDense(hidden_dim if depth[1] > 1 else output_dim))
    lstms.append(encoder)
    self.add(decoder)
    # depth[1] - 1 extra decoder-side layers on top of the attention decoder
    lstms = [decoder]
    for i in range(1, depth[1]):
        layer = LSTMEncoder(output_dim=hidden_dim, state_input=False,
                            return_sequences=True, **kwargs)
        self.add(layer)
        lstms.append(layer)
        self.add(Dropout(dropout))
    if depth[1] > 1:
        self.add(TimeDistributedDense(output_dim))
    self.encoder = encoder
    self.decoder = decoder
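# A minimal instantiation sketch in the style of the seq2seq library's README
# (the concrete dimensions are illustrative assumptions, not from the original code;
# input_dim and input_length are forwarded to the encoder LSTMs via **kwargs):
model = AttentionSeq2seq(input_dim=5, input_length=7, hidden_dim=10,
                         output_length=8, output_dim=20, depth=2)
model.compile(loss='mse', optimizer='rmsprop')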
def buildLSTMModel(layerCount, input_dim, added_dim):
    model = Sequential()
    model.add(TimeDistributedDense(input_dim=input_dim, output_dim=input_dim + added_dim))
    for lcount in range(layerCount):
        model.add(LSTM(input_dim=input_dim + added_dim, output_dim=input_dim + added_dim,
                       return_sequences=True))
    model.add(TimeDistributedDense(input_dim=input_dim + added_dim, output_dim=input_dim))
    return model
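# Usage sketch for buildLSTMModel, assuming the old Keras 0.x API used above
# (TimeDistributedDense, per-layer input_dim, nb_epoch). The data shapes are
# illustrative; here the model reconstructs its own input, autoencoder-style.
import numpy as np

model = buildLSTMModel(layerCount=2, input_dim=16, added_dim=8)
model.compile(loss='mse', optimizer='rmsprop')

X = np.random.rand(32, 10, 16).astype('float32')  # (batch, timesteps, input_dim)
model.fit(X, X, nb_epoch=1, batch_size=8)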
def getModel(LSTM_HIDDEN_STATES=300, FIRST_DROPOUT=0.0, SECOND_DROPOUT=0.0,
             DENSE_LAYERS_SIZE=300, seqLength=64, word_data_dim=100, char_data_dim=43,
             optimizer='rmsprop', lr=0.001):
    nb_classes = 2
    nb_filters = 10
    decoder = Graph()
    decoder.add_input(name='input1', input_shape=(seqLength, word_data_dim), dtype='float')
    decoder.add_input(name='input2', input_shape=(seqLength, char_data_dim), dtype='float')
    decoder.add_node(Dropout(0.0), name='mergedInput', inputs=['input1', 'input2'])
    #decoder.add_node(Masking(mask_value=0.,), input='mergedInput', name='maskedInput')
    decoder.add_node(LSTM(LSTM_HIDDEN_STATES, return_sequences=True),
                     input='mergedInput', name='LSTMForward')
    decoder.add_node(LSTM(LSTM_HIDDEN_STATES, return_sequences=True, go_backwards=True),
                     input='mergedInput', name='LSTMBackward')
    decoder.add_node(Dropout(FIRST_DROPOUT), name='firstDropout',
                     inputs=['LSTMForward', 'LSTMBackward'])
    decoder.add_node(TimeDistributedDense(DENSE_LAYERS_SIZE, activation='relu'),
                     name='tdd1', input='firstDropout')
    decoder.add_node(Dropout(SECOND_DROPOUT), name='secondDropout', input='tdd1')
    decoder.add_node(TimeDistributedDense(nb_classes, activation='softmax'),
                     input='secondDropout', name='tdd2')
    decoder.add_output(name='output', input='tdd2')
    if optimizer == 'rmsprop':
        optimizer = RMSprop(lr)
    elif optimizer == 'sgd':
        optimizer = SGD(lr)
    decoder.compile(optimizer, {'output': 'categorical_crossentropy'}, metrics=['accuracy'])
    return decoder
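# Hypothetical call sketch for getModel using the legacy Graph API it is built on.
# Batch size and label construction below are made-up illustrations: per-timestep
# one-hot targets over the two classes the model predicts.
import numpy as np

decoder = getModel(seqLength=64, word_data_dim=100, char_data_dim=43)
X_word = np.random.rand(8, 64, 100).astype('float32')
X_char = np.random.rand(8, 64, 43).astype('float32')
y = np.random.randint(0, 2, size=(8, 64))
Y = np.stack([1 - y, y], axis=-1).astype('float32')  # (batch, seqLength, 2)

decoder.fit({'input1': X_word, 'input2': X_char, 'output': Y}, nb_epoch=1)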
def get_outputs(X_batch):
    # NOTE: numTime, weights_dir and memories come from the enclosing scope.
    l2_reg = 0.01
    stim_shape = (numTime, 40, 50, 50)
    RMSmod = RMSprop(lr=0.001, rho=0.99, epsilon=1e-6)
    num_filters = (8, 16)
    filter_size = (13, 13)
    weight_init = 'he_normal'
    batchsize = 100
    model = Sequential()
    # first convolutional layer
    model.add(TimeDistributedConvolution2D(num_filters[0], filter_size[0], filter_size[1],
                                           input_shape=stim_shape, border_mode='same',
                                           subsample=(1, 1), W_regularizer=l2(l2_reg)))
    # add relu activation separately for threshold visualizations
    model.add(Activation('relu'))
    # max pooling layer
    model.add(TimeDistributedMaxPooling2D(pool_size=(2, 2), ignore_border=True))
    # flatten
    model.add(TimeDistributedFlatten())
    # dense (affine) layer with relu activation
    model.add(TimeDistributedDense(num_filters[1], W_regularizer=l2(l2_reg), activation='relu'))
    # LSTM; forget gate bias automatically initialized to 1, default weight initializations recommended
    model.add(LSTM(100 * num_filters[1], return_sequences=True))
    # final dense (affine) layer with softplus activation
    model.add(TimeDistributedDense(1, init=weight_init, W_regularizer=l2(l2_reg),
                                   activation='softplus'))
    model.compile(loss='poisson_loss', optimizer=RMSmod)
    model.load_weights(weights_dir)
    if not memories:
        get_outputs = theano.function([model.layers[0].input],
                                      model.layers[5].get_output(train=False))
        outputs = get_outputs(X_batch)
    else:
        # Rebuild the network with an LSTM variant that returns its memory cells,
        # reusing the trained weights layer by layer.
        model2 = Sequential()
        model2.add(TimeDistributedConvolution2D(num_filters[0], filter_size[0], filter_size[1],
                                                input_shape=stim_shape,
                                                weights=model.layers[0].get_weights(),
                                                border_mode='same', subsample=(1, 1),
                                                W_regularizer=l2(l2_reg)))
        model2.add(Activation('relu'))
        model2.add(TimeDistributedMaxPooling2D(pool_size=(2, 2), ignore_border=True))
        model2.add(TimeDistributedFlatten())
        model2.add(TimeDistributedDense(num_filters[1], weights=model.layers[4].get_weights(),
                                        W_regularizer=l2(l2_reg), activation='relu'))
        model2.add(LSTMMem(100 * num_filters[1], weights=model.layers[5].get_weights(),
                           return_memories=True))
        model2.compile(loss='poisson_loss', optimizer=RMSmod)
        get_outputs = theano.function([model2.layers[0].input],
                                      model2.layers[5].get_output(train=False))
        outputs = get_outputs(X_batch)
    return outputs
def __init__(self, input_dim, input_length, output_dim, init='glorot_uniform',
             inner_init='orthogonal', forget_bias_init='one', activation='tanh',
             inner_activation='hard_sigmoid', weights=None, truncate_gradient=-1,
             hidden_state=None, batch_size=None, depth=1, remember_state=False,
             inner_return_sequences=True, return_sequences=True):
    if not weights:
        weights = [None] * 5  # no weights for the merge layer
    if not hidden_state:
        hidden_state = [None] * 6
    super(StatefulContainer, self).__init__()
    forward = DeepLSTM(input_dim=input_dim * 2, output_dim=output_dim,
                       input_length=input_length, weights=weights[2],
                       hidden_state=hidden_state[2], batch_size=batch_size, depth=depth,
                       remember_state=remember_state,
                       inner_return_sequences=inner_return_sequences,
                       return_sequences=return_sequences, init='glorot_uniform',
                       inner_init='orthogonal', forget_bias_init='one',
                       activation='tanh', inner_activation='hard_sigmoid')
    reverse = DeepLSTM(input_dim=input_dim * 2, output_dim=output_dim,
                       input_length=input_length, weights=weights[3],
                       hidden_state=hidden_state[3], batch_size=batch_size, depth=depth,
                       remember_state=remember_state,
                       inner_return_sequences=inner_return_sequences,
                       return_sequences=return_sequences, init='glorot_uniform',
                       inner_init='orthogonal', forget_bias_init='one',
                       activation='tanh', inner_activation='hard_sigmoid',
                       go_backwards=True)
    # A common input to both forward and reverse LSTMs.
    # This layer learns a direction-invariant representation of your input data.
    self.add(TimeDistributedDense(input_dim=input_dim, output_dim=input_dim * 2,
                                  input_length=input_length))
    if weights[0]:
        self.layers[0].set_weights(weights[0])
    self.add(Dropout(0.7))
    self.add(forward)
    self.add(reverse)
    reverse.set_previous(forward.layers[0].previous)  # Woah!
    merge = Merge([forward, reverse], mode='concat', concat_axis=-1)
    # Collect params, regularizers and constraints from the first two layers
    # so the Merge layer trains them as well.
    layers = self.layers[:2]
    for l in layers:
        params, regs, consts, updates = l.get_params()
        merge.regularizers += regs
        merge.updates += updates
        for p, c in zip(params, consts):
            if p not in merge.params:
                merge.params.append(p)
                merge.constraints.append(c)
    self.add(merge)
    if return_sequences:
        self.add(TimeDistributedDense(output_dim))
    else:
        self.add(Dense(output_dim))
def create_model(vocab_size, args):
    if args.rnn == 'GRU':
        RNN = recurrent.GRU
    elif args.rnn == 'LSTM':
        RNN = recurrent.LSTM
    else:
        assert False, "Invalid RNN"
    if args.bidirectional:
        model = Graph()
        model.add_input(name="input", batch_input_shape=(args.batch_size, 1), dtype="uint")
        model.add_node(Embedding(vocab_size, args.embed_size, mask_zero=True),
                       name="embed", input='input')
        for i in xrange(args.layers):
            model.add_node(RNN(args.hidden_size, return_sequences=True),
                           name='forward' + str(i + 1),
                           input='embed' if i == 0 else 'dropout' + str(i) if args.dropout > 0 else None,
                           inputs=['forward' + str(i), 'backward' + str(i)] if i > 0 and args.dropout == 0 else [])
            model.add_node(RNN(args.hidden_size, return_sequences=True, go_backwards=True),
                           name='backward' + str(i + 1),
                           input='embed' if i == 0 else 'dropout' + str(i) if args.dropout > 0 else None,
                           inputs=['forward' + str(i), 'backward' + str(i)] if i > 0 and args.dropout == 0 else [])
            if args.dropout > 0:
                model.add_node(Dropout(args.dropout), name='dropout' + str(i + 1),
                               inputs=['forward' + str(i + 1), 'backward' + str(i + 1)])
        model.add_node(TimeDistributedDense(vocab_size, activation="softmax"),
                       name="softmax",
                       input='dropout' + str(args.layers) if args.dropout > 0 else None,
                       inputs=['forward' + str(args.layers), 'backward' + str(args.layers)] if args.dropout == 0 else [])
        model.add_output(name='output', input="softmax")
    else:
        model = Sequential()
        model.add(Embedding(vocab_size, args.embed_size, mask_zero=True))
        for i in xrange(args.layers):
            model.add(RNN(args.hidden_size, return_sequences=True))
            if args.dropout > 0:
                model.add(Dropout(args.dropout))
        model.add(TimeDistributedDense(vocab_size, activation="softmax"))
    return model
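# Sketch of calling create_model with an argparse-style namespace; the field
# values are illustrative assumptions, not defaults from the original script.
from argparse import Namespace

args = Namespace(rnn='LSTM', bidirectional=False, layers=2, dropout=0.5,
                 embed_size=128, hidden_size=256, batch_size=32)
model = create_model(vocab_size=10000, args=args)
model.compile(loss='categorical_crossentropy', optimizer='adam')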
def build_model(args):
    np.random.seed(args.seed)
    graph = Graph()
    graph.add_input('input', input_shape=(args.input_width,), dtype='int')
    graph.add_node(build_embedding_layer(args), input='input', name='embedding')
    graph.add_node(LSTM(args.n_units, truncate_gradient=args.truncate_gradient,
                        return_sequences=True),
                   input='embedding', name='lstm0')
    graph.add_node(LSTM(args.n_units, truncate_gradient=args.truncate_gradient,
                        return_sequences=True),
                   input='lstm0', name='lstm1')
    # Attention module.
    graph.add_node(TimeDistributedDense(args.n_units, activation='relu'),
                   input='lstm1', name='attention0')
    graph.add_node(TimeDistributedDense(args.n_units, activation='relu'),
                   input='attention0', name='attention1')
    graph.add_node(TimeDistributedDense(args.n_units, activation='softmax'),
                   input='attention1', name='attention2')
    # Apply mask from output of attention module to LSTM output.
    graph.add_node(TimeDistributedMerge(mode='sum'), inputs=['lstm1', 'attention2'],
                   name='applyattn', merge_mode='mul')
    graph.add_node(Dense(args.n_classes, activation='softmax'),
                   input='applyattn', name='softmax')
    graph.add_output(input='softmax', name='output')
    load_weights(args, graph)
    optimizer = build_optimizer(args)
    graph.compile(loss={'output': args.loss}, optimizer=optimizer)
    return graph
def build_model(glove, vocab, module_prep_model, c):
    s0pad = s1pad = c['spad']
    max_sentences = c['max_sentences']
    rnn_dim = 1
    print('Model')
    model = Graph()
    # ===================== inputs of size (batch_size, max_sentences, s_pad)
    model.add_input('si03d', (max_sentences, s0pad), dtype=int)  # XXX: cannot be cast to int -> problem?
    model.add_input('si13d', (max_sentences, s1pad), dtype=int)
    if True:  # TODO: if flags
        model.add_input('f04d', (max_sentences, s0pad, nlp.flagsdim))
        model.add_input('f14d', (max_sentences, s1pad, nlp.flagsdim))
        model.add_node(Reshape_((s0pad, nlp.flagsdim)), 'f0', input='f04d')
        model.add_node(Reshape_((s1pad, nlp.flagsdim)), 'f1', input='f14d')
    # ===================== reshape to (batch_size * max_sentences, s_pad)
    model.add_node(Reshape_((s0pad,)), 'si0', input='si03d')
    model.add_node(Reshape_((s1pad,)), 'si1', input='si13d')
    # ===================== outputs from sts
    _prep_model(model, glove, vocab, module_prep_model, c, c['oact'], s0pad, s1pad, rnn_dim)
    # out = ['scoreS1', 'scoreS2']
    # ===================== reshape (batch_size * max_sentences,) -> (batch_size, max_sentences, rnn_dim)
    model.add_node(Reshape_((max_sentences, rnn_dim)), 'sts_in1', input='scoreS1')
    model.add_node(Reshape_((max_sentences, rnn_dim)), 'sts_in2', input='scoreS2')
    # ===================== [w_full_dim, q_full_dim] -> [class, rel]
    model.add_node(TimeDistributedDense(1, activation='sigmoid',
                                        W_regularizer=l2(c['l2reg']),
                                        b_regularizer=l2(c['l2reg'])),
                   'c', input='sts_in1')
    model.add_node(TimeDistributedDense(1, activation='sigmoid',
                                        W_regularizer=l2(c['l2reg']),
                                        b_regularizer=l2(c['l2reg'])),
                   'r', input='sts_in2')
    model.add_node(SumMask(), 'mask', input='si03d')
    # ===================== mean of class over rel
    model.add_node(WeightedMean(max_sentences=max_sentences), name='weighted_mean',
                   inputs=['c', 'r', 'mask'])
    model.add_output(name='score', input='weighted_mean')
    return model
def build(max_len, embedding_dim, word2id_size, skipgram_offsets, pos2id_size,
          pdtbmark2id_size, pdtbpair2id_size, pdtbpair_offsets):
    model = Graph()
    loss = {}
    # input: word ids with masked post-padding (doc, time_pad)
    model.add_input(name='x_word_pad', input_shape=(None,), dtype='int')
    # input: word ids with random post-padding (doc, time_pad)
    model.add_input(name='x_word_rand', input_shape=(None,), dtype='int')
    # shared 1: word embedding layer (doc, time_pad, emb)
    model.add_node(Embedding(word2id_size, embedding_dim, input_length=max_len,
                             init='glorot_uniform'),
                   name='shared_1', input='x_word_pad')  # XXX: mask_zero=True
    # shared 2: bidirectional GRU full sequence layer (doc, time_pad, repr)
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid',
                       inner_activation='sigmoid', init='he_uniform',
                       inner_init='orthogonal'),
                   name='shared_2_fwd', input='shared_1')
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid',
                       inner_activation='sigmoid', init='he_uniform',
                       inner_init='orthogonal', go_backwards=True),
                   name='shared_2_bck', input='shared_1')
    model.add_node(TimeDistributedDense(embedding_dim, init='he_uniform'),
                   name='shared_2',
                   inputs=['shared_1', 'shared_2_fwd', 'shared_2_bck'],
                   merge_mode='concat')
    # shared 3: bidirectional GRU full sequence layer (doc, time_pad, repr)
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid',
                       inner_activation='sigmoid', init='he_uniform',
                       inner_init='orthogonal'),
                   name='shared_3_fwd', input='shared_2')
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid',
                       inner_activation='sigmoid', init='he_uniform',
                       inner_init='orthogonal', go_backwards=True),
                   name='shared_3_bck', input='shared_2')
    model.add_node(TimeDistributedDense(embedding_dim, init='he_uniform'),
                   name='shared_3',
                   inputs=['shared_2', 'shared_3_fwd', 'shared_3_bck'],
                   merge_mode='concat')
    # shared 4: bidirectional GRU full sequence layer (doc, time_pad, repr)
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid',
                       inner_activation='sigmoid', init='he_uniform',
                       inner_init='orthogonal'),
                   name='shared_4_fwd', input='shared_3')
    model.add_node(GRU(embedding_dim, return_sequences=True, activation='sigmoid',
                       inner_activation='sigmoid', init='he_uniform',
                       inner_init='orthogonal', go_backwards=True),
                   name='shared_4_bck', input='shared_3')
    model.add_node(TimeDistributedDense(embedding_dim, init='he_uniform'),
                   name='shared_4',
                   inputs=['shared_3', 'shared_4_fwd', 'shared_4_bck'],
                   merge_mode='concat')
    # skip-gram model: skip-gram labels (doc, time_pad, offset)
    skipgram_out = skipgram_model(model, ['shared_1', 'x_word_rand'], max_len,
                                  embedding_dim, word2id_size, skipgram_offsets)
    model.add_output(name='y_skipgram', input=skipgram_out)
    loss['y_skipgram'] = 'mse'
    # POS model: POS tags (doc, time_pad, pos2id)
    pos_out = pos_model(model, ['shared_2'], max_len, embedding_dim, pos2id_size)
    model.add_output(name='y_pos', input=pos_out)
    loss['y_pos'] = 'binary_crossentropy'
    # PDTB marking model: discourse relation boundary markers (doc, time, offset, pdtbmark2id)
    pdtbmark_out = pdtbmark_model(model, ['shared_3'], max_len, embedding_dim,
                                  pdtbmark2id_size)
    model.add_output(name='y_pdtbmark', input=pdtbmark_out)
    loss['y_pdtbmark'] = 'binary_crossentropy'
    # PDTB pairs model: discourse relation span-pair occurrences (doc, time, offset, pdtbpair2id)
    pdtbpair_out = pdtbpair_model(model, ['shared_4'], max_len, embedding_dim,
                                  pdtbpair2id_size, pdtbpair_offsets)
    model.add_output(name='y_pdtbpair', input=pdtbpair_out)
    loss['y_pdtbpair'] = 'binary_crossentropy'
    model.compile(optimizer='rmsprop', loss=loss)
    return model
def LSTMModel(self, nHidden=150, lr=0.01):
    # print('nHidden: %i\tlr: %.3f' % (nHidden, lr))
    self.rnnModel.add(GRU(nHidden, activation='sigmoid',
                          input_shape=(None, self.maxFeatures),
                          return_sequences=True))
    # self.rnnModel.add(LSTM(nHidden, activation='sigmoid', input_shape=(None, nHidden), return_sequences=True))
    self.rnnModel.add(TimeDistributedDense(nHidden))
    self.rnnModel.add(Activation('relu'))
    self.rnnModel.add(TimeDistributedDense(self.maxFeatures))
    self.rnnModel.add(Activation('softmax'))
    rmsprop = RMSprop(lr=lr, rho=0.9, epsilon=1e-06)
    self.rnnModel.compile(loss='categorical_crossentropy', optimizer=rmsprop)
def test_seq_to_seq(self):
    print('sequence to sequence data:')
    (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=1000, nb_test=200,
                                                         input_shape=(3, 5),
                                                         output_shape=(3, 5),
                                                         classification=False)
    print('X_train:', X_train.shape)
    print('X_test:', X_test.shape)
    print('y_train:', y_train.shape)
    print('y_test:', y_test.shape)
    model = Sequential()
    model.add(TimeDistributedDense(y_train.shape[-1],
                                   input_shape=(X_train.shape[1], X_train.shape[2])))
    model.compile(loss='hinge', optimizer='rmsprop')
    history = model.fit(X_train, y_train, nb_epoch=12, batch_size=16,
                        validation_data=(X_test, y_test), verbose=0)
    self.assertTrue(history.history['val_loss'][-1] < 0.8)
def get_RNN_model(in_shape, td_num=512, lstm_out_dim=256, nb_hidden=100, drop1=0.5, drop2=0.5):
    # note: the parameter originally named "ltsm_out_dim" is renamed to fix the typo
    model = Sequential()
    model.add(GaussianNoise(0.05, input_shape=in_shape))
    model.add(TimeDistributedDense(td_num))
    model.add(LSTM(lstm_out_dim, return_sequences=True))
    reg = l2(0.05)
    # model.add(TimeDistributedDense(td_num, W_regularizer=l2(0.03)))
    # reg.set_param(model.layers[3].get_params()[0][0])
    # model.layers[3].regularizers = [reg]
    model.add(Dropout(drop1))
    model.add(LSTM(lstm_out_dim))
    # reg = l2(0.05)
    # reg.set_param(model.layers[3].get_params()[0][0])
    # model.layers[3].regularizers = [reg]
    model.add(Dropout(drop1))
    # model.regularizers = [l2(0.05)]
    # model.add(Activation('relu'))
    model.add(Flatten())
    model.add(Dense(nb_hidden, W_regularizer=l2(0.05)))
    model.add(Activation('relu'))
    model.add(Dropout(drop2))
    model.add(Dense(1))
    model.add(Activation('linear'))
    model.compile(loss='mse', optimizer='rmsprop')
    return model
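# Usage sketch for get_RNN_model; the (timesteps, features) shape is an
# assumption for illustration, matching the 3D input GaussianNoise expects here.
import numpy as np

model = get_RNN_model(in_shape=(20, 128))
X = np.random.rand(64, 20, 128).astype('float32')   # (batch, timesteps, features)
y = np.random.rand(64, 1).astype('float32')         # one regression target per sequence
model.fit(X, y, nb_epoch=1, batch_size=16)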
def create_neural_network(freq_dimensions, hidden_dimensions, rec_units=1):
    model = Sequential()
    model.add(TimeDistributedDense(input_dim=freq_dimensions, output_dim=hidden_dimensions))
    for i in range(rec_units):
        # The recurrent layers sit on top of the hidden_dimensions-wide dense layer,
        # so input_dim is hidden_dimensions (the original passed freq_dimensions here,
        # which does not match the previous layer's output).
        model.add(LSTM(input_dim=hidden_dimensions, output_dim=hidden_dimensions,
                       return_sequences=True))
    model.add(TimeDistributedDense(input_dim=hidden_dimensions, output_dim=freq_dimensions))
    model.compile(loss='mean_squared_error', optimizer='rmsprop')
    return model
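# Usage sketch for create_neural_network, e.g. on spectrogram-like frames.
# The shapes are invented for illustration; since the output layer maps back
# to freq_dimensions, the model can be fit to reconstruct its input sequence.
import numpy as np

model = create_neural_network(freq_dimensions=64, hidden_dimensions=128)
X = np.random.rand(16, 50, 64).astype('float32')  # (batch, timesteps, freq bins)
model.fit(X, X, nb_epoch=1, batch_size=4)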
def build_model():
    print('Build model...')
    graph = Graph()
    graph.add_input(name='input', ndim=3)
    graph.add_node(GRU(len_circ_repr, 128, return_sequences=True), name='gru1', input='input')
    graph.add_node(TimeDistributedDense(128, 128), name='tdd', input='gru1')
    graph.add_node(GRU(128, 128, return_sequences=False), name='gru2', input='tdd')
    graph.add_node(Dense(128, 120, activation='softmax'), name='seconds', input='gru2')
    graph.add_node(Dense(128, 120, activation='softmax'), name='minutes', input='gru2')
    graph.add_output(name='out1', input='seconds')
    graph.add_output(name='out2', input='minutes')
    print('Compile model...')
    graph.compile('rmsprop', {'out1': 'categorical_crossentropy',
                              'out2': 'categorical_crossentropy'})
    return graph
def creat_binary_tag_LSTM(sourcevocabsize, targetvocabsize, source_W, input_seq_lenth,
                          output_seq_lenth, hidden_dim, emd_dim,
                          loss='categorical_crossentropy', optimizer='rmsprop'):
    encoder_a = Sequential()
    encoder_b = Sequential()
    encoder_c = Sequential()
    l_A_embedding = Embedding(input_dim=sourcevocabsize + 1,
                              output_dim=emd_dim,
                              input_length=input_seq_lenth,
                              mask_zero=True,
                              weights=[source_W])
    encoder_a.add(l_A_embedding)
    encoder_a.add(Dropout(0.3))
    encoder_b.add(l_A_embedding)
    encoder_b.add(Dropout(0.3))
    encoder_c.add(l_A_embedding)
    model = Sequential()
    encoder_a.add(LSTM(hidden_dim, return_sequences=True))
    encoder_b.add(LSTM(hidden_dim, return_sequences=True, go_backwards=True))
    encoder_rb = Sequential()
    encoder_rb.add(ReverseLayer2(encoder_b))
    # concatenate the forward LSTM with the re-reversed backward LSTM
    encoder_ab = Merge((encoder_a, encoder_rb), mode='concat')
    model.add(encoder_ab)
    decodelayer = LSTMDecoder_tag(hidden_dim=hidden_dim, output_dim=hidden_dim,
                                  input_length=input_seq_lenth,
                                  output_length=output_seq_lenth,
                                  state_input=False, return_sequences=True)
    model.add(decodelayer)
    model.add(TimeDistributedDense(targetvocabsize + 1))
    model.add(Activation('softmax'))
    model.compile(loss=loss, optimizer=optimizer)
    return model
def prep_model(model, N, s0pad, s1pad, c):
    winputs = ['e0', 'e1']
    if c['wproject']:
        model.add_shared_node(name='wproj', inputs=winputs, outputs=['e0w', 'e1w'],
                              layer=TimeDistributedDense(output_dim=int(N * c['wdim']),
                                                         activation=c['wact']))
        winputs = ['e0w', 'e1w']
    model.add_shared_node(name='bow', inputs=winputs, outputs=['e0b', 'e1b'],
                          layer=TimeDistributedMerge(mode='ave'))
    bow_last = ('e0b', 'e1b')
    for i in range(c['deep']):
        bow_next = ('e0b[%d]' % (i,), 'e1b[%d]' % (i,))
        model.add_shared_node(name='deep[%d]' % (i,), inputs=bow_last, outputs=bow_next,
                              layer=Dense(output_dim=N, init=c['nninit'],
                                          activation=c['nnact'],
                                          W_regularizer=l2(c['l2reg'])))
        bow_last = bow_next
    # Projection
    if c['project']:
        model.add_shared_node(name='proj', inputs=bow_last, outputs=['e0p', 'e1p'],
                              layer=Dense(input_dim=N, output_dim=int(N * c['pdim']),
                                          activation=c['pact'],
                                          W_regularizer=l2(c['l2reg'])))
        return ('e0p', 'e1p')
    else:
        return bow_last
def VGG19_hieratt(query_in_size, query_embed_size, nb_classes):
    """Stack hierarchical attention on pre-trained VGG19.
    Requires https://github.com/fchollet/deep-learning-models"""
    base_model = VGG19(weights='imagenet')
    input_image = base_model.input
    input_question = Input(shape=(query_in_size,))  # question vector

    # Model up to 3rd block (the original referenced an undefined img_in;
    # input_image defined above is used instead)
    f_1 = Model(input=input_image, output=base_model.get_layer('block3_pool').output)
    f_1 = f_1(input_image)
    f_1 = Reshape((256, 28 * 28))(f_1)
    f_1 = Permute((2, 1))(f_1)

    q_1 = Dense(query_embed_size, activation='relu')(input_question)  # encode question
    # Add question embedding to each feature column
    q_1 = RepeatVector(28 * 28)(q_1)
    q_f = merge([f_1, q_1], 'concat')

    # Estimate and apply attention per feature
    att_1 = TimeDistributedDense(1, activation="sigmoid")(q_f)
    att_1 = Lambda(repeat_1, output_shape=(28 * 28, 256))(att_1)
    att_1 = merge([f_1, att_1], 'mul')

    # Reshape to the original feature map from previous layer
    att_1 = Permute((2, 1))(att_1)
    f_1_att = Reshape((256, 28, 28))(att_1)

    model = Model(input=[input_image, input_question], output=f_1_att)
    model.summary()
    return model
def make_dense(X, y, num_layers, width, dropout):
    assert len(X.shape) == 2
    assert len(y.shape) == 2
    vocab_size = np.amax(X) + 1
    print 'Vocab size:', vocab_size
    m = Sequential()
    m.add(Embedding(vocab_size, 8))
    m.add(Dropout(dropout))
    m.add(TimeDistributedDense(8, 64))
    m.add(Flatten())
    m.add(BatchNormalization((64 * X.shape[1],)))
    m.add(PReLU((64 * X.shape[1],)))
    m.add(Dropout(dropout))
    m.add(Dense(64 * X.shape[1], width))
    for i in range(num_layers):
        m.add(BatchNormalization((width,)))
        m.add(PReLU((width,)))
        m.add(Dropout(dropout))
        m.add(Dense(width, width))
    m.add(BatchNormalization((width,)))
    m.add(PReLU((width,)))
    m.add(Dropout(dropout))
    m.add(Dense(width, y.shape[1]))
    m.add(Activation('softmax'))
    return m, 1
def rel_types_model(model, ins, max_len, embedding_dim, rel_types2id_size, focus, pre='rtypes'):
    """Discourse relation types model as Keras Graph."""
    # prepare focus dimensionality
    model.add_node(RepeatVector(rel_types2id_size), name=pre + '_focus_rep', input=focus)
    model.add_node(Permute((2, 1)), name=pre + '_focus', input=pre + '_focus_rep')
    # discourse relation types dense neural network (sample, time_pad, rel_types2id)
    model.add_node(TimeDistributedDense(rel_types2id_size, init='he_uniform'),
                   name=pre + '_dense', input=ins[0])
    model.add_node(Activation('softmax'), name=pre + '_softmax', input=pre + '_dense')
    # multiplication to focus the activations (doc, time_pad, rel_types2id)
    model.add_node(Activation('linear'), name=pre + '_out',
                   inputs=[pre + '_focus', pre + '_softmax'], merge_mode='mul')
    return pre + '_out'
def build_net():
    ng = NumberGenerator(lambda x, y: x + y)
    # parameters
    n_epochs = 10
    training_size = 50000
    rnn = recurrent.LSTM
    hidden_size = 128
    batch_size = 128
    layers = 1
    print('Building model...')
    model = Sequential()
    # The first recurrent layer must return sequences so the stacked RNN layers
    # and the per-timestep dense layer receive 3D input (the original omitted this).
    model.add(rnn(hidden_size, return_sequences=True))
    for _ in range(layers):
        model.add(rnn(hidden_size, return_sequences=True))
    model.add(TimeDistributedDense(2))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    for epoch, epoch_data in enumerate(ng.generate_data(n_epochs, size=training_size)):
        print('\n' + '-' * 50 + '\nIteration %d' % epoch)
        model.fit(epoch_data[0], epoch_data[1], batch_size=batch_size, show_accuracy=True)
def get_enc2dec(RNN, HIDDEN_SIZE=128, LAYERS=1, DIM=100, MAXLEN=100):
    """
    Enc-Dec Model
    see Vinyals et al. 2014 http://arxiv.org/pdf/1412.7449v1.pdf
    """
    model = Graph()
    model.add_input(name='input', input_shape=(None, DIM))
    model.add_node(RNN(HIDDEN_SIZE, return_sequences=True), name='e_r0', input='input')
    prev_node = 'e_r0'
    for layer in xrange(LAYERS - 1):
        model.add_node(RNN(HIDDEN_SIZE, return_sequences=True),
                       name='e_r' + str(layer + 1), input=prev_node)
        prev_node = 'e_r' + str(layer + 1)
    model.add_node(RNN(HIDDEN_SIZE), name='e_final', input=prev_node)
    # broadcast the final encoder state across the decoder's timesteps
    model.add_node(RepeatVector(MAXLEN), name='encoder', input='e_final')
    prev_node = 'encoder'
    for layer in xrange(LAYERS - 1):
        model.add_node(RNN(HIDDEN_SIZE, return_sequences=True),
                       name='d_r' + str(layer + 1), input=prev_node)
        prev_node = 'd_r' + str(layer + 1)
    model.add_node(TimeDistributedDense(MAXLEN), name='d_tdd', input=prev_node)
    model.add_node(Activation('softmax'), name='softmax', input='d_tdd')
    model.add_output(name='output', input='softmax')
    return model
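# Hypothetical usage of get_enc2dec with the legacy Graph API; the data below
# is random and only demonstrates the expected shapes. Note that as written
# the final TimeDistributedDense(MAXLEN) makes the per-step output MAXLEN wide.
import numpy as np
from keras.layers import recurrent

model = get_enc2dec(recurrent.LSTM, HIDDEN_SIZE=64, LAYERS=2, DIM=50, MAXLEN=20)
model.compile('adam', {'output': 'categorical_crossentropy'})

X = np.random.rand(8, 30, 50).astype('float32')  # (batch, input steps, DIM)
Y = np.random.rand(8, 20, 20).astype('float32')  # (batch, MAXLEN, MAXLEN)
model.fit({'input': X, 'output': Y}, nb_epoch=1)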
def build_RNN_model(vocab_size, embedding_dims, rnn_layer_dim, num_classes):
    """Build the RNN model"""
    model = Sequential()
    # Embedding layer
    model.add(Embedding(vocab_size, embedding_dims))
    # Recurrent layer
    model.add(SimpleRNN(int(rnn_layer_dim),
                        init='glorot_uniform',
                        inner_init='orthogonal',
                        activation='tanh',
                        W_regularizer=None,
                        U_regularizer=None,
                        b_regularizer=None,
                        dropout_W=0.0,
                        dropout_U=0.0,
                        return_sequences=True,
                        stateful=False))
    # Time distributed dense layer (activation is softmax, since it is a classification problem)
    model.add(TimeDistributedDense(num_classes, init='glorot_uniform', activation='softmax'))
    return model
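# Usage sketch for build_RNN_model; vocabulary size, sequence length and class
# count are placeholders. Targets are one-hot per timestep to match the
# per-timestep softmax output.
import numpy as np

model = build_RNN_model(vocab_size=5000, embedding_dims=64,
                        rnn_layer_dim=128, num_classes=10)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

X = np.random.randint(0, 5000, size=(32, 25))                        # (batch, timesteps) word ids
Y = np.eye(10)[np.random.randint(0, 10, size=(32, 25))].astype('float32')  # (batch, timesteps, classes)
model.fit(X, Y, nb_epoch=1, batch_size=8)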
def build_model(self, params):
    hidden_layers = params['hidden_layers']
    input_dim = params['feat_size']
    output_dim = params['phone_vocab_size']
    drop_prob = params['drop_prob_encoder']
    self.nLayers = len(hidden_layers)
    # First layer is an encoder layer
    self.model.add(TimeDistributedDense(hidden_layers[0], init='glorot_uniform',
                                        input_dim=input_dim))
    self.model.add(Dropout(drop_prob))
    # Second layer is the recurrent layer
    if params.get('recurrent_type', 'simple') == 'simple':
        self.model.add(SimpleRNN(hidden_layers[1], init='glorot_uniform',
                                 inner_init='orthogonal', activation='sigmoid',
                                 weights=None, truncate_gradient=-1,
                                 return_sequences=False,
                                 input_dim=hidden_layers[0], input_length=None))
    elif params.get('recurrent_type', 'simple') == 'lstm':
        self.model.add(LSTM(hidden_layers[1], init='glorot_uniform',
                            inner_init='orthogonal',
                            input_dim=hidden_layers[0], input_length=None))
    # Then we add a dense projection layer to map the RNN outputs to vocab size
    self.model.add(Dropout(drop_prob))
    self.model.add(Dense(output_dim, input_dim=hidden_layers[1], init='uniform'))
    self.model.add(Activation('softmax'))
    self.solver = getSolver(params)
    self.model.compile(loss='categorical_crossentropy', optimizer=self.solver)
    # score = model.evaluate(test_x)
    self.f_train = self.model.train_on_batch
    return self.f_train
def test_sequence_to_sequence():
    '''
    Apply the same Dense layer to each element of the time dimension of the
    input and make predictions for the output sequence elements. This does not
    make use of the temporal structure of the sequence (see TimeDistributedDense
    for more details).
    '''
    np.random.seed(1337)
    (X_train, y_train), (X_test, y_test) = get_test_data(nb_train=500, nb_test=200,
                                                         input_shape=(3, 5),
                                                         output_shape=(3, 5),
                                                         classification=False)
    model = Sequential()
    model.add(TimeDistributedDense(y_train.shape[-1],
                                   input_shape=(X_train.shape[1], X_train.shape[2])))
    model.compile(loss='hinge', optimizer='rmsprop')
    history = model.fit(X_train, y_train, nb_epoch=20, batch_size=16,
                        validation_data=(X_test, y_test), verbose=0)
    assert history.history['val_loss'][-1] < 0.8
def train_model(dataset, h0_dim, h1_dim, out_dim):
    X_train, y_train, X_test, y_test = dataset
    batch_size = 128
    nb_epoch = 100
    model = Sequential()
    model.add(RNN(h0_dim, input_shape=(None, X_train.shape[-1]), return_sequences=True))
    model.add(TimeDistributedDense(out_dim))
    model.add(Activation("linear"))
    model.compile(loss="mse", optimizer="rmsprop")
    #model.get_config(verbose=1)
    #yaml_string = model.to_yaml()
    #with open('ifshort_mlp.yaml', 'w') as f:
    #    f.write(yaml_string)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    checkpointer = ModelCheckpoint(filepath="/tmp/ifshort_rnn_weights.hdf5",
                                   verbose=1, save_best_only=True)
    model.fit(X_train, y_train,
              batch_size=batch_size, nb_epoch=nb_epoch,
              show_accuracy=False, verbose=2,
              validation_data=(X_test, y_test),
              callbacks=[early_stopping, checkpointer])
def train_rnn(character_corpus, seq_len, train_test_split_ratio):
    model = Sequential()
    model.add(Embedding(character_corpus.char_num(), 256))
    model.add(LSTM(256, 5120, activation='sigmoid',
                   inner_activation='hard_sigmoid', return_sequences=True))
    model.add(Dropout(0.5))
    model.add(TimeDistributedDense(5120, character_corpus.char_num()))
    model.add(Activation('time_distributed_softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

    seq_X, seq_Y = character_corpus.make_sequences(seq_len)
    print "Sequences are made"

    # the slice index must be an integer, not a float ratio
    train_seq_num = int(train_test_split_ratio * seq_X.shape[0])
    X_train = seq_X[:train_seq_num]
    Y_train = to_time_distributed_categorical(seq_Y[:train_seq_num],
                                              character_corpus.char_num())
    X_test = seq_X[train_seq_num:]
    Y_test = to_time_distributed_categorical(seq_Y[train_seq_num:],
                                             character_corpus.char_num())
    print "Begin train model"

    checkpointer = ModelCheckpoint(filepath="model.step", verbose=1, save_best_only=True)
    model.fit(X_train, Y_train, batch_size=256, nb_epoch=100, verbose=2,
              validation_data=(X_test, Y_test), callbacks=[checkpointer])
    print "Model is trained"

    score = model.evaluate(X_test, Y_test, batch_size=512)
    print "valid score = ", score
    return model
def __init__(self, output_dim, hidden_dim, output_length, depth=1, dropout=0.25, **kwargs):
    super(SimpleSeq2seq, self).__init__()
    if type(depth) not in [list, tuple]:
        depth = (depth, depth)
    self.encoder = LSTM(hidden_dim, **kwargs)
    self.decoder = LSTM(hidden_dim if depth[1] > 1 else output_dim,
                        return_sequences=True, **kwargs)
    # depth[0] - 1 extra encoder-side layers below the final encoder LSTM
    for i in range(1, depth[0]):
        self.add(LSTM(hidden_dim, return_sequences=True, **kwargs))
        self.add(Dropout(dropout))
    self.add(self.encoder)
    self.add(Dropout(dropout))
    # feed the encoder's final state to every decoder timestep
    self.add(RepeatVector(output_length))
    self.add(self.decoder)
    # depth[1] - 1 extra decoder-side layers on top of the decoder LSTM
    for i in range(1, depth[1]):
        self.add(LSTM(hidden_dim, return_sequences=True, **kwargs))
        self.add(Dropout(dropout))
    if depth[1] > 1:
        self.add(TimeDistributedDense(output_dim))
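# Instantiation sketch in the style of the seq2seq library's examples
# (dimensions are illustrative; input_dim is forwarded to the LSTMs via **kwargs):
model = SimpleSeq2seq(input_dim=5, hidden_dim=10, output_length=8, output_dim=8, depth=3)
model.compile(loss='mse', optimizer='rmsprop')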
def train_seq2seq(self):
    print "Input sequence read, starting training"
    #X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
    #Y_train = sequence.pad_sequences(self.Y_train, maxlen=self.maxlen)
    #X_val = sequence.pad_sequences(self.X_val, maxlen=self.maxlen)
    #y_val = sequence.pad_sequences(self.Y_val, maxlen=self.maxlen)
    #X_test = sequence.pad_sequences(self.X_test, maxlen=self.maxlen)
    #Y_test = sequence.pad_sequences(self.Y_test, maxlen=self.maxlen)
    model = Sequential()
    model.add(Embedding(len(self.proproces.vocab_hind), 30, input_length=self.maxlen))
    model.add(RNN(30))  #, input_shape=(100, 128)))
    model.add(RepeatVector(self.maxlen))
    model.add(RNN(30, return_sequences=True))
    model.add(TimeDistributedDense(len(self.proproces.vocab_en)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    for e in range(100):
        print("epoch %d" % e)
        for (X, Y) in self.proproces.gen_seq(
                "../indian-parallel-corpora/hi-en/tok/dev.hi-en.en.0",
                "../indian-parallel-corpora/hi-en/tok/dev.hi-en.hi", 128):
            loss, acc = model.train_on_batch(X, Y)  #, batch_size=64, nb_epoch=1)
            print("Loss is %f, accuracy is %f " % (loss, acc))
        # After one epoch test one sentence
        if e % 5 == 0:
            print("Enter sentence in hindi")
            inp = raw_input().decode("utf-8")
            tokens = inp.split()
            seq = []
            for token in tokens:
                if token in self.proproces.vocab_hind:
                    seq.append(self.proproces.vocab_hind[token])
                else:
                    token = "UNK"
                    seq.append(self.proproces.vocab_hind[token])
            #seq = map(lambda x: self.proproces.vocab_hind[x], tokens)
            # Normalize seq to maxlen
            X = []
            X.append(seq)
            print X
            temp = sequence.pad_sequences(X, maxlen=self.maxlen)
            #temp[0:len(seq)] = seq
            print len(temp)
            #temp = np.asarray(temp).reshape(128,)
            print temp.shape
            prob = model.predict_on_batch(temp)  #, batch_size=1, verbose=0)
            translated = self.decode(prob)
            print("Translated is", translated)
            print("Probabilities are", prob)
            print("Shape of prob tensor is", prob.shape)
def __init__(self, positive_weight, _num_of_hidden_units):
    super(LSTM_CNN_EEG, self).__init__()
    self.positive_weight = positive_weight
    self._num_of_hidden_units = _num_of_hidden_units
    '''
    define the neural network model:
    '''
    # from keras.layers.extra import *
    import numpy as np
    from keras.models import Sequential, model_from_json
    from keras.regularizers import l2
    from keras.datasets import mnist
    # from keras.initializations import normal, identity
    from keras.layers.recurrent import SimpleRNN, LSTM, GRU
    from keras.optimizers import RMSprop, Adadelta
    from keras.layers.convolutional import Convolution2D, Convolution1D
    from keras.layers.convolutional import MaxPooling1D, MaxPooling2D
    from keras.layers.core import (Dense, Activation, TimeDistributedDense, Dropout,
                                   Reshape, Flatten, Permute)
    # from keras.layers.wrappers import TimeDistributed

    size = 28
    maxToAdd = 200

    # define our time-distributed setup
    model = Sequential()
    model.add(TimeDistributedDense(10, input_shape=(maxToAdd, 55)))
    # model.add(Convolution2D(1, 1, 10, border_mode='valid', input_shape=(1, maxToAdd, 55)))
    model.add(Activation('tanh'))
    model.add(Reshape((1, maxToAdd, 10)))  # this line updated to work with keras 1.0.2
    model.add(Convolution2D(3, 20, 1, border_mode='valid'))  # org
    model.add(Activation('tanh'))
    model.add(Convolution2D(1, 1, 1, border_mode='same'))  # org
    model.add(Activation('tanh'))
    model.add(MaxPooling2D(pool_size=(20, 1), border_mode='valid'))
    model.add(Permute((2, 1, 3)))
    model.add(Reshape((9, 10)))  # this line updated to work with keras 1.0.2
    model.add(GRU(output_dim=20, return_sequences=False))
    # model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy')
    self.model = model
    # model.predict(np.random.rand(28, 200, 55).astype(np.float32)).shape
    print model.layers[-1].output_shape
    # print "2 {} {}".format(model.layers[1].output_shape[-3:], (1, maxToAdd, np.prod(model.layers[1].output_shape[-3:])))
    self.original_weights = self.model.get_weights()
    """ :type Sequential"""
def train_seq2seq(self):
    print "Input sequence read, starting training"
    #X_train = sequence.pad_sequences(self.X_train, maxlen=self.maxlen)
    #Y_train = sequence.pad_sequences(self.Y_train, maxlen=self.maxlen)
    #X_val = sequence.pad_sequences(self.X_val, maxlen=self.maxlen)
    #y_val = sequence.pad_sequences(self.Y_val, maxlen=self.maxlen)
    #X_test = sequence.pad_sequences(self.X_test, maxlen=self.maxlen)
    #Y_test = sequence.pad_sequences(self.Y_test, maxlen=self.maxlen)
    model = Sequential()
    #model.add(Embedding(len(self.proproces.vocab_hind), 100,
    #                    input_length=self.maxlen))
    model.add(RNN(80, input_shape=(self.maxlen, len(self.proproces.vocab_hind))))
    model.add(RepeatVector(self.maxlen))
    model.add(RNN(80, return_sequences=True))
    model.add(TimeDistributedDense(len(self.proproces.vocab_en)))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    for e in range(1, 2000):
        print("epoch %d" % e)
        for (X, Y) in self.proproces.gen_seq(
                "../indian-parallel-corpora/hi-en/tok/dev.hi-en.en.0",
                "../indian-parallel-corpora/hi-en/tok/dev.hi-en.hi", 64):
            loss, acc = model.train_on_batch(X, Y)  #, batch_size=64, nb_epoch=1)
            print("Loss is %f, accuracy is %f " % (loss, acc))
        # After one epoch test one sentence
        if e % 10 == 0:
            print("Enter sentence in hindi")
            inp = raw_input().decode("utf-8")
            tokens = inp.split()
            seq = []
            for token in tokens:
                if token in self.proproces.vocab_hind:
                    seq.append(self.proproces.vocab_hind[token])
                else:
                    token = "UNK"
                    seq.append(self.proproces.vocab_hind[token])
            #seq = map(lambda x: self.proproces.vocab_hind[x], tokens)
            # Normalize seq to maxlen and one-hot encode each position
            X = []
            x = []
            temp = [0] * self.maxlen
            temp[0:len(seq)] = seq
            for ind in temp:
                t = [0] * len(self.proproces.vocab_hind)
                t[ind] = 1
                x.append(t)
            X.append(x)
            X = np.asarray(X)
            print len(X)
            prob = model.predict(X)
            self.decode(prob)
            print("Probabilities are", prob)
def test_lstm():
    # load wiki data
    X_train_np, X_valid_np, X_test_np = gen_data_wiki()
    batchsize = 100
    blocklength = 25000  # 450000
    bsize_test = batchsize
    numframe = 100
    numframe_test = 1250  # 2500 # 5000
    X_valid = onehot(X_valid_np).reshape(bsize_test, X_valid_np.shape[0] / bsize_test, 205)
    X_test = onehot(X_test_np).reshape(bsize_test, X_test_np.shape[0] / bsize_test, 205)
    nb_classes = 205
    X_train_shared = theano.shared(np.zeros((batchsize, blocklength, nb_classes)).astype('float32'),
                                   name='train_set', borrow=True)
    X_valid_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'),
                                   name='valid_set', borrow=True)
    X_test_shared = theano.shared(np.zeros((bsize_test, numframe_test, nb_classes)).astype('float32'),
                                  name='test_set', borrow=True)

    # build the model
    from keras.layers.recurrent import LSTM, SimpleRNN, LSTMgrave
    from layer_icml import LSTM_bu, LSTM_td, RNN_td, RNN_bu, RNN_sh, RNN_dp, LSTM_dp, RNN_shallow
    from layer_icml import RNN_relugate, RNN_ens, RNN_2tanh, RNN_ntanh, RNN_multidp, LSTM_multi, LSTM_u, RNN_utanh, LSTM_uu, LSTM_uugrave
    from keras.layers.core import Dense, Activation, TimeDistributedDense
    from keras.initializations import normal, identity

    x = T.tensor3()
    y = T.matrix()

    name_init = 'uniform'
    n_h = 2450
    L1 = LSTMgrave(output_dim=n_h, init='uniform', batchsize=batchsize,
                   inner_init='uniform', input_shape=(None, nb_classes),
                   return_sequences=True)
    name_model = 'lstm_shallowgrave_' + str(n_h) + name_init + '0.01' + '_batchsize' + str(batchsize) + '_numframe' + str(numframe)

    # RNN variants (commented-out alternative configurations kept from the original)
    name_act = 'tanh'
    name_init = 'uniform'
    #n_h = 2048; L1 = RNN_shallow(output_dim=n_h, init='uniform', U_init=name_init, activation=name_act, input_shape=(None, nb_classes), return_sequences=True); name_model = "rnn_tanh" + str(n_h) + "_" + name_act + name_init + '0.1'
    #n_h = 2048; L1 = SimpleRNN(output_dim=n_h, init='uniform', inner_init='uniform', activation=name_act, input_shape=(None, nb_classes), return_sequences=True); name_model = "rnn_shallow" + str(n_h) + name_act + name_init + '0.05'
    #n_h = 4096; L1 = RNN_utanh(output_dim=n_h, init='uniform', U_init=name_init, activation=name_act, input_shape=(None, nb_classes), return_sequences=True); name_model = "rnn_utanh_2_0_0" + str(n_h) + "_" + name_act + name_init + '0.01'
    n_h = 2048
    in_act = 'tanh'
    L1 = LSTM_uugrave(output_dim=n_h, batchsize=batchsize, init='uniform',
                      inner_init='uniform', input_shape=(None, nb_classes),
                      return_sequences=True)
    name_model = 'lstm_u_grave' + in_act + '_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01' + '_batchsize' + str(batchsize) + '_numframe' + str(numframe)
    #n_h = 1200; in_act = 'tanh'; L2 = LSTM_uu(output_dim=n_h, init='uniform', inner_init='uniform', input_shape=(None, n_h), return_sequences=True); name_model = 'lstm_u_stack2' + in_act + '_1.0_1.0_1.0_0' + str(n_h) + name_init + '0.01'
    #n_h = 700; L2 = LSTM_uu(output_dim=n_h, init='uniform', inner_init='uniform', input_shape=(None, n_h), return_sequences=True); name_model = 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L3 = LSTM_uu(output_dim=n_h, init='uniform', inner_init='uniform', input_shape=(None, n_h), return_sequences=True); name_model = 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L4 = LSTM_uu(output_dim=n_h, init='uniform', inner_init='uniform', input_shape=(None, n_h), return_sequences=True); name_model = 'lstm_u_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'
    #n_h = 700; L5 = LSTM_uu(output_dim=n_h, init='uniform', inner_init='uniform', input_shape=(None, n_h), return_sequences=True); name_model = '7005layerlstm_uu_1.0_1.0_0.5_0' + str(n_h) + name_init + '0.03'

    D1 = TimeDistributedDense(nb_classes)
    D1._input_shape = [None, None, n_h]
    O = Activation('softmax')
    #layers = [L1, L2, L3, L4, L5, D1, O]
    layers = [L1, D1, O]
    #layers = [L1, L2, D1, O]

    load_model = True
    if load_model:
        #f_model = open('/data/lisatmp3/zhangsa/lstm/models/180rnn_td_reluidentityotherinit_identity_sgd0.1_clip10.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune5e-4inorder_withtest.pkl', 'rb')
        #f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.pkl', 'rb')
        f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtestfinetune1e-5inorder_withtest.pkl', 'rb')
        layers = pickle.load(f_model)
        f_model.close()
        name_model_load = 'wiki100lstm_u_gravetanh_1.0_0.5_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest' + 'finetune2e-6'
        #name_perpmat_load = 'wiki100lstm_u_gravetanh_1.0_1.0_1.0_02048uniform0.01_batchsize100_numframe100_adam0.001inorder_withtest.npy'
        L1 = layers[0]

    # wire the layers together manually and collect their parameters
    out = x
    params = []
    for l in layers:
        if not load_model:
            l.build()
        l.input = out
        params += l.params
        if l == L1:
            out = l.get_output()[0]
            h0 = l.get_output()[0]
            c0 = l.get_output()[1]
        else:
            out = l.get_output()

    # compute the loss
    loss = -T.mean(T.log(out)[:, :numframe - 1, :] * x[:, 1:, :])
    logperp_valid = T.mean(-T.log2(T.sum(out[:, :numframe_test - 1, :] * x[:, 1:, :], axis=2)))
    logperp_train = T.mean(-T.log2(T.sum(out[:, :numframe - 1, :] * x[:, 1:, :], axis=2)))

    # set optimizer
    from keras.constraints import identity as ident
    from keras.optimizers import RMSprop, SGD, Adam
    lr_ = 2 * 1e-6
    clipnorm_ = 10000
    rmsprop = RMSprop(lr=lr_, clipnorm=clipnorm_)  # fixed typo: was clipnrom
    sgd = SGD(lr=lr_, momentum=0.9, clipnorm=clipnorm_)
    adam = Adam(lr=lr_)
    #opt = sgd; name_opt = 'sgd' + str(lr_); clip_flag = False
    #opt = rmsprop; name_opt = 'rmsprop' + str(lr_)
    opt = adam
    name_opt = 'adam' + str(lr_)
    clip_flag = False
    if clip_flag:
        name_opt = name_opt + '_clip' + str(clipnorm_)

    # param update for regular parameters
    constraints = [ident() for p in params]
    updates = opt.get_updates(params, constraints, loss)
    index = T.iscalar()
    f_train = theano.function([index], [loss, h0, c0], updates=updates,
                              givens={x: X_train_shared[:, index * numframe:(index + 1) * numframe, :]})

    # perplexity functions
    f_perp_valid = theano.function([], [logperp_valid, h0, c0], givens={x: X_valid_shared})
    f_perp_test = theano.function([], [logperp_valid, h0, c0], givens={x: X_test_shared})
    #f_perp_valid = theano.function([index], [logperp_valid], givens={x: X_valid_shared[index * bsize_test:(index + 1) * bsize_test]})
    #f_perp_test = theano.function([index], [logperp_valid], givens={x: X_test_shared[index * bsize_test:(index + 1) * bsize_test]})

    def perp_valid():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_valid.shape[1] / numframe_test):
            X_valid_shared.set_value(X_valid[:, k * numframe_test:(k + 1) * numframe_test, :])
            perp, h0, c0 = f_perp_valid()
            logperp_acc += perp
            L1.H0.set_value(h0[:, -1, :])
            L1.C0.set_value(c0[:, -1, :])
            n += 1
        return logperp_acc / n

    def perp_test():
        logperp_acc = 0
        n = 0
        L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
        for k in xrange(X_test.shape[1] / numframe_test):
            X_test_shared.set_value(X_test[:, k * numframe_test:(k + 1) * numframe_test, :])
            perp, h0, c0 = f_perp_test()
            logperp_acc += perp
            L1.H0.set_value(h0[:, -1, :])
            L1.C0.set_value(c0[:, -1, :])
            n += 1
        return logperp_acc / n

    #def perp_valid():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_valid_np.shape[0] / (bsize_test * numframe_test)):
    #        X_valid_shared.set_value(onehot(X_valid_np[k * bsize_test * numframe_test:(k + 1) * bsize_test * numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_valid_shared.get_value().shape[0] / bsize_test):
    #            logperp_acc += f_perp_valid(i)
    #            n += 1
    #    return (logperp_acc / n)

    #def perp_test():
    #    logperp_acc = 0
    #    n = 0
    #    for k in xrange(X_test_np.shape[0] / (bsize_test * numframe_test)):
    #        X_test_shared.set_value(onehot(X_test_np[k * bsize_test * numframe_test:(k + 1) * bsize_test * numframe_test]).reshape((bsize_test, numframe_test, 205)))
    #        for i in xrange(X_test_shared.get_value().shape[0] / bsize_test):
    #            logperp_acc += f_perp_test(i)
    #            n += 1
    #    return (logperp_acc / n)

    ######## test model ########
    #test_score = perp_valid()
    #pdb.set_trace()

    epoch_ = 9000
    perpmat = np.zeros((epoch_, 3))
    t_start = time.time()
    name = 'wiki100' + name_model + '_' + name_opt
    if load_model:
        name = name_model_load
        #perpmat = np.load(name_perpmat_load)
    #only_block = False
    #if only_block:
    #    name = name + 'random_only_block'
    #else:
    #    name = name + 'random_per_row_in_block'
    name = name + 'inorder'
    blocksize = batchsize * blocklength
    bestscore = 100000000
    for epoch in xrange(epoch_):
        for k in xrange(X_train_np.shape[0] / blocksize):
            t_s = time.time()
            print "reloading " + str(k) + " th train patch..."
            #if only_block:
            #    pos = np.random.randint(0, X_train_np.shape[0] - blocksize)
            #    X_train_shared.set_value(onehot(X_train_np[pos: pos + blocksize]).reshape(batchsize, blocklength, 205))
            #else:
            #    pos = np.random.randint(0, X_train_np.shape[0] - blocklength, batchsize)
            #    tmp = np.zeros((batchsize, blocklength, 205)).astype('float32')
            #    for j in xrange(batchsize):
            #        tmp[j] = onehot(X_train_np[pos[j]: pos[j] + blocklength])
            #    X_train_shared.set_value(tmp)
            X_train_shared.set_value(onehot(X_train_np[k * blocksize:(k + 1) * blocksize]).reshape(batchsize, blocklength, 205))
            print "reloading finished, time cost: " + str(time.time() - t_s)
            # reset the recurrent state at the start of each block
            L1.H0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            L1.C0.set_value(np.zeros((batchsize, n_h)).astype('float32'))
            for i in xrange(blocklength / numframe):
                loss, h0, c0 = f_train(i)
                # carry the last hidden/cell state over to the next frame window
                L1.H0.set_value(h0[:, -1, :])
                L1.C0.set_value(c0[:, -1, :])
                if i % 10 == 0:
                    t_end = time.time()
                    print "Time consumed: " + str(t_end - t_start) + " secs."
                    t_start = time.time()
                    print "Epoch " + str(epoch) + " " + name + ": The training loss in batch " + str(k * (blocklength / numframe) + i) + " is: " + str(loss) + "."
            if k % 6 == 0:
                # save results
                m = epoch * X_train_np.shape[0] / (blocksize * 6) + k / 6
                perpmat[m][0], perpmat[m][1] = 0, perp_valid()
                perpmat[m][2] = perp_test()
                np.save('/data/lisatmp4/zhangsa/rnn_trans/results/' + name + '_withtest.npy', perpmat)
                # save model
                if perpmat[m][1] < bestscore:
                    bestscore = perpmat[m][1]
                    f_model = open('/data/lisatmp4/zhangsa/rnn_trans/models/' + name + '_withtest.pkl', 'wb+')
                    pickle.dump(layers, f_model)
                    f_model.close()
                print "Epoch " + str(epoch) + " " + name + ": The training perp is: " + str(perpmat[epoch][0]) \
                    + ", test perp is: " + str(perpmat[epoch][1]) + "."