def TrainLstmCrf(data_name, model_name):
    """Train a BiLSTM-CRF tagger over the S/B/I/E tag set and save it.

    Sentences are loaded with ``get_sents``, encoded as integer ids, padded
    to ``max_len`` and split 90/10 into train and test parts. The model is
    then fitted one mini-batch at a time, writing a checkpoint every 50
    mini-batches and the final model at the end.

    Args:
        data_name: Dataset identifier forwarded to ``get_sents``.
        model_name: Path of the final saved model; also used as the suffix
            of the periodic checkpoints written under ``./models/``.
    """
    n_classes = 4
    max_len = 75
    batch_size = 128
    epoch = 100
    tags = ['S', 'B', 'I', 'E']

    sentences, words = get_sents(datasets=data_name)
    print(len(sentences), len(words))

    # Word ids start at 1, so the largest id in use equals len(words).
    word2idx = {w: i + 1 for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}
    vocab_size = len(words)

    # Every sentence item is indexed as item[0] = word, item[1] = tag.
    X = [[word2idx[w[0]] for w in s] for s in sentences]
    # NOTE(review): the pad value vocab_size - 1 is also the id of a real
    # word; kept unchanged for compatibility — confirm this is intended.
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=vocab_size - 1)
    y = [[tag2idx[w[1]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["E"])
    y = [to_categorical(i, num_classes=n_classes) for i in y]

    # Split the data into train and test parts.
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)
    print(len(X_tr), len(y_tr), len(X_te), len(y_te))
    # Sequence-length input fed to the CRF: every sequence is declared to be
    # max_len long, i.e. padding positions are included.
    s = np.asarray([max_len] * batch_size, dtype='int32')

    # Build the model.
    # NOTE(review): both inputs have a fixed batch dimension, so this
    # presumably relies on `minibatches` yielding only full batches — confirm.
    word_ids = Input(batch_shape=(batch_size, max_len), dtype='int32')
    sequence_lengths = Input(batch_shape=[batch_size, 1], dtype='int32')
    print(sequence_lengths)
    # The embedding table needs one row per id in 0..len(words); sizing it
    # with len(words) alone left the highest word id out of range.
    word_embeddings = Embedding(vocab_size + 1, n_classes)(word_ids)
    blstm = Bidirectional(LSTM(units=50, return_sequences=True))(word_embeddings)
    model = TimeDistributed(Dense(4, activation='tanh'))(blstm)
    crf = CrfModel()
    pred = crf(inputs=[model, sequence_lengths])
    model = Model(inputs=[word_ids, sequence_lengths], outputs=[pred])
    print("word_ids:{}".format(word_ids))
    print("sequence_lengths:{}".format(sequence_lengths))
    model.compile(optimizer="rmsprop", loss=crf.loss, metrics=['accuracy'])
    print(model.summary())

    k = 0
    for batch_x, batch_y in minibatches(X_tr, y_tr, batch_size=batch_size):
        # NOTE(review): each mini-batch is fitted for `epoch` (100) epochs
        # before the next one is seen — confirm this is intended.
        model.fit([batch_x, s], np.array(batch_y), epochs=epoch, batch_size=batch_size)
        k += 1
        if k % 50 == 0:
            model.save("./models/{}_{}".format(k, model_name))
            print("saved")

    # Save the final model.
    model.save(model_name)
def new_model(image_size=299, video_length=40, cnn_trainable=False):
    """Build and compile a binary video classifier (per-frame InceptionV3 + LSTM).

    Args:
        image_size: Height and width of each square RGB frame.
        video_length: Number of frames per clip.
        cnn_trainable: Whether the InceptionV3 weights are updated during
            training.

    Returns:
        The compiled Keras ``Model`` with a single sigmoid output unit.
    """
    inputs = Input(shape=(video_length, image_size, image_size, 3))

    # Global average pooling makes the CNN emit one flat feature vector per
    # frame, so the TimeDistributed output is the 3-D sequence the LSTM
    # needs; without pooling it is a 5-D feature-map tensor.
    cnn = inception_v3.InceptionV3(include_top=False, weights='imagenet',
                                   pooling='avg')
    # Freeze/unfreeze the CNN itself. The flag was previously assigned to the
    # output tensor, which left the CNN trainable regardless of the argument.
    cnn.trainable = cnn_trainable

    features = TimeDistributed(cnn)(inputs)
    features = LSTM(512)(features)
    features = Dropout(0.5)(features)
    # Sigmoid, not softmax: softmax over a single unit is constantly 1.0 and
    # cannot be trained with binary cross-entropy.
    outputs = Dense(1, activation='sigmoid')(features)

    model = Model(inputs=inputs, outputs=outputs)
    adam = keras.optimizers.Adam(learning_rate=1e-5)
    model.compile(loss='binary_crossentropy', optimizer=adam, metrics=['accuracy'])
    model.summary()
    return model
def seq2seq():
    """Assemble and compile the stacked-BiLSTM + CRF sequence tagger.

    The network takes two per-timestep inputs: a vector of width
    ``SENTENCE_ENCODING_DIM`` and a single scalar ("time") feature. The
    first input goes through masking, dropout and two bidirectional LSTMs,
    is concatenated with the scalar feature, passes two dense+dropout
    stages and ends in a CRF layer with ``len(LABELS)`` classes.

    Returns:
        The compiled Keras ``Model``.
    """
    label_count = len(LABELS)

    converse_input = Input(shape=(None, SENTENCE_ENCODING_DIM))
    time_input = Input(shape=(None, 1))

    # Encoder branch over the sentence encodings (mask value is -1).
    encoded = Masking(mask_value=-1.)(converse_input)
    encoded = Dropout(0.2)(encoded)
    for _ in range(2):
        encoded = Bidirectional(LSTM(1024, return_sequences=True))(encoded)
    encoded = Dropout(0.3)(encoded)

    # Append the scalar feature to every timestep, then project down.
    hidden = concatenate([encoded, time_input], axis=-1)
    for width in (1024, 512):
        hidden = TimeDistributed(Dense(width, activation='relu'))(hidden)
        hidden = Dropout(0.3)(hidden)

    # A CRF replaces a per-timestep softmax as the output layer.
    crf = CRF(label_count, sparse_target=True)
    predictions = crf(hidden)

    tagger = Model(inputs=[converse_input, time_input], outputs=predictions)
    tagger.summary()
    tagger.compile('adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return tagger
# Sequence encoder: a BiLSTM followed by a per-timestep dense projection.
# (Author's note: maybe try a dropout of 0.1?)
main_lstm = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.6)
)(x)
model = TimeDistributed(Dense(50, activation="relu"))(main_lstm)

# CRF output layer with n_tags + 1 classes (author's note: "PD" —
# presumably the extra class is for padding; confirm).
crf = CRF(n_tags + 1)
out = crf(model)
model = Model(inputs=[word_in, char_in], outputs=out)

# Optimizer and compilation; the loss and metric come from the CRF layer.
adam = optimizers.Adam(lr=0.01, epsilon=None, decay=1e-1)
model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

# 90/10 split of the publication ids, in their existing order.
n_train_pubs = int(len(pub_ids) * 0.9)
tr_pubs = pub_ids[:n_train_pubs]
val_pubs = pub_ids[n_train_pubs:]

train = subdata_getter(tr_pubs, data)
validation = subdata_getter(val_pubs, data)
tr_generator = DataGenerator(tr_pubs, train)
# NOTE(review): val_generator is built but not passed to fit_generator below.
val_generator = DataGenerator(val_pubs, validation)

history = NBatchLogger()
model.fit_generator(
    generator=tr_generator,
    shuffle=False,
    epochs=10,
    verbose=0,
    callbacks=[history],
)
# Embed the POS-tag input and join it with the word embeddings.
pos_emb = Embedding(
    input_dim=len(pos),
    output_dim=10,
    input_length=max_len,
)(pos_input)
modified_input = keras.layers.concatenate([word_emb, pos_emb])

# BiLSTM encoder, then a per-timestep dense layer (as suggested by neuralNer).
model_1 = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.1)
)(modified_input)
model = TimeDistributed(Dense(50, activation="relu"))(model_1)

# CRF output layer.
crf = CRF(n_tags)
out = crf(model)

model = Model(inputs=[input, pos_input], outputs=out)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
print(model.summary())

# Training; 10% of the training data is held out for validation.
history = model.fit(
    [X_tr, X_pos_tr],
    np.array(y_tr),
    batch_size=32,
    epochs=60,
    validation_split=0.1,
    verbose=1,
)

# Testing.
test_pred = model.predict([X_te, X_pos_te], verbose=1)
# Inverse of tag2idx; not used directly in this section (presumably read by
# pred2label — verify).
idx2tag = {i: w for w, i in tag2idx.items()}
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)
destination_scores = get_recall_precision(test_labels, pred_labels, "Destination")
print("Recall, Precision and F-score are", destination_scores)

model.save("BILSTM+CRF_with_pos_without_embeddings.model")
def test_exist(self, glove, test_data, test_labels):
    """Tag a corpus with the saved BiLSTM-CRF and write one ``.a1`` file per document.

    Loads word embeddings, maps every corpus token to its row in the
    embedding matrix, rebuilds the network, loads the stored weights and,
    for every document, writes one line per predicted entity to
    ``words-silver/<textid>.a1`` in the form ``start<TAB>end<TAB>text``.

    Args:
        glove: If truthy, load GloVe vectors; otherwise load word2vec vectors.
        test_data: Forwarded to ``corpusreader.CorpusReader``.
        test_labels: Forwarded to ``corpusreader.CorpusReader``.
    """
    # Load the word embeddings.
    utils = wordUtils.Utils()
    if glove:
        self.words_list, self.embedding_matrix = utils.load_glove()
    else:
        self.words_list, self.embedding_matrix = utils.load_word2vec()
    # Unknown words all map to one extra index just past the vocabulary
    # (the original comment describes that embedding row as all zeros).
    unword_n = len(self.words_list)

    # Build the word -> row lookup once instead of scanning the vocabulary
    # list for every token. setdefault keeps the first occurrence, which is
    # the index list.index() would have returned.
    word_index = {}
    for row, word in enumerate(self.words_list):
        word_index.setdefault(word, row)

    # Read the corpus and attach the embedding row of every token.
    cr = corpusreader.CorpusReader(test_data, test_labels)
    corpus = cr.trainseqs
    for doc in corpus:
        doc['embs'] = [word_index.get(tok, unword_n) for tok in doc['tokens']]

    # Rebuild the network whose weights are loaded below.
    word_ids = Input(shape=(None,))
    el = Embedding(len(self.words_list) + 1, 200,
                   weights=[self.embedding_matrix], trainable=False)(word_ids)
    bl1 = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                        merge_mode="concat", name="lstm1")(el)
    bl2 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                        merge_mode="concat", name="lstm2")(bl1)
    bl3 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                        merge_mode="concat", name="lstm3")(bl2)
    # A dense layer as suggested by neuralNer.
    model = TimeDistributed(Dense(50, activation="relu"))(bl3)
    crf = CRF(self.lab_len)
    out = crf(model)
    model = Model(word_ids, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()
    save_load_utils.load_all_weights(model, 'word_models/words_glove_multiLSTM31.h5')

    for doc in corpus:
        p = model.predict(np.array([doc['embs']]))
        p = np.argmax(p, axis=-1)

        # Group token positions into entities. Tag ids:
        # {'O': 0, 'B-E': 1, 'I-E': 2, 'E-E': 3, 'S-E': 4}
        offsets = defaultdict(list)  # entity number -> token positions
        counter = 0   # number of the entity currently being collected
        B = False     # True while inside an entity opened by a B tag
        last = 0      # most recent entity tag seen (not reset by 'O')
        for position, tag in enumerate(p[0]):
            if tag == 1 and last == 1:
                # A new B while the previous B was never closed: start a
                # new entity.
                counter = counter + 1
                offsets[counter].append(position)
                B = True
            elif tag == 1:
                B = True
                offsets[counter].append(position)
                last = 1
            elif tag == 2 and B:
                offsets[counter].append(position)
                last = 2
            elif tag == 3 and B:
                # End tag closes the current entity.
                offsets[counter].append(position)
                last = 3
                B = False
                counter = counter + 1
            elif tag == 4:
                # Single-token entity.
                offsets[counter].append(position)
                counter = counter + 1
                last = 4
            else:
                B = False

        # Write the entities; the context manager closes the file even if a
        # write fails (the handle was previously never closed).
        textid = str(doc['textid'])
        with open("words-silver/" + textid + ".a1", 'w') as abstract:
            for word in offsets.values():
                size = len(word)
                if size == 1:
                    s = word[0]  # singleton entity
                    abstract.write(str(doc['tokstart'][s]) + "\t")
                    abstract.write(str(doc['tokend'][s]) + "\t")
                    abstract.write(str(doc['tokens'][s]) + "\n")
                elif size > 1:
                    s = word[0]   # first token of the entity
                    e = word[-1]  # last token of the entity
                    abstract.write(str(doc['tokstart'][s]) + "\t")
                    abstract.write(str(doc['tokend'][e]) + "\t")
                    # Tokens are joined without a separator.
                    token = "".join(doc['tokens'][c] for c in word)
                    abstract.write(str(token) + "\n")
outputs=[conv_model_time_distributed]) conv_model_time_distributed._uses_learning_phase = True #for learning=True, for testing = False #Visualize Model: if flag_plot_model == 1: keras.utils.plot_model(conv_model_single_image_as_model) keras.utils.vis_utils.plot_model(conv_model_single_image_as_model) from IPython.display import SVG from keras.utils.vis_utils import model_to_dot SVG( model_to_dot(conv_model_single_image_as_model).create(prog='dot', format='svg')) #Summarize Model: conv_model_single_image_as_model.summary() conv_model_time_distributed.summary() def clip_shift_layer(predicted_shifts, max_shift=1): # predicted_shifts[(predicted_shifted > max_shift)] = max_shift; return K.clip(predicted_shifts, -max_shift, max_shift) def custom_loss_function(predicted_shifts, true_shifts): #if i predict images max_shift = max_shift_number_global predicted_x = predicted_shifts[0] predicted_y = predicted_shifts[1] true_x = true_shifts[0] true_y = true_shifts[1] difference_clipped = K.clip(K.abs(predicted_shifts - true_shifts),
def model_with_padding(self, DICT, n_char):
    """Train the three-layer BiLSTM-CRF on sequences padded to a fixed length.

    Sequences and labels come from ``self.get_seq(DICT)``, are post-padded
    to ``self.maxSeqLength`` and the labels are one-hot encoded with
    ``self.lab_len`` classes. After ``self.epochsN`` epochs of training the
    weights are saved to ``max_seq_<epochsN>_32b.h5``.

    Args:
        DICT: Forwarded to ``self.get_seq``.
        n_char: The embedding table is created with ``n_char + 1`` rows.
    """
    # Sequences and their labels, separated and already numeric.
    sequences, labels = self.get_seq(DICT)

    features = pad_sequences(sequences, maxlen=self.maxSeqLength, padding='post')
    padded_labels = pad_sequences(labels, maxlen=self.maxSeqLength, padding='post')
    targets = [to_categorical(row, num_classes=self.lab_len) for row in padded_labels]

    # Network: embedding -> three stacked BiLSTMs -> dense -> CRF.
    token_ids = Input(shape=(self.maxSeqLength, ))
    hidden = Embedding(n_char + 1, 200, name="embed")(token_ids)
    for units, layer_name in ((128, "lstm1"), (64, "lstm2"), (64, "lstm3")):
        recurrent = LSTM(units, return_sequences=True,
                         recurrent_dropout=0.5, dropout=0.5)
        hidden = Bidirectional(recurrent, merge_mode="concat",
                               name=layer_name)(hidden)
    hidden = TimeDistributed(Dense(self.lab_len, activation="relu"))(hidden)

    crf = CRF(self.lab_len)
    network = Model(token_ids, crf(hidden))
    network.compile(optimizer="rmsprop", loss=crf.loss_function,
                    metrics=[crf.accuracy])
    network.summary()

    # Original note: train with 32, 147, 245, 735.
    network.fit(features, np.array(targets), batch_size=32,
                epochs=self.epochsN, validation_split=0.0, verbose=1)

    # Persist the weights reached after the last epoch.
    save_load_utils.save_all_weights(network, 'max_seq_%s_32b.h5' % self.epochsN)
def model_no_padding(self, DICT, n_char):
    """Train the BiLSTM-CRF without padding by batching equal-length sequences.

    Training sequences are mapped through ``DICT`` (in place, on
    ``self.train_data``), grouped by length, and the model is fitted on one
    length group at a time. After every epoch the weights are saved to
    ``mini-batch-results/epoch_<n>.h5`` and scored with
    ``self.test_minibatch`` / ``self.eval``. Finally the weights of the
    best-scoring epoch are copied to ``minibatch_<n>.h5``.

    Args:
        DICT: Mapping applied to every element of each ``'corpus'`` sequence;
            also forwarded to ``self.test_minibatch``.
        n_char: The embedding table is created with ``n_char + 1`` rows.
    """
    # Convert BIO tags to numbers.
    self.convert_tags()

    # Replace each sequence by its DICT-mapped form; elements missing from
    # DICT become None.
    for record in self.train_data:
        record['corpus'] = [DICT.get(symbol) for symbol in record['corpus']]

    # Group sequences and label sequences by their length.
    seqs_by_len = {}
    labels_by_len = {}
    for record in self.train_data:
        seqs_by_len.setdefault(len(record['corpus']), []).append(record['corpus'])
        labels_by_len.setdefault(len(record['bion']), []).append(record['bion'])
    sizes = list(seqs_by_len)

    # Network: embedding -> three stacked BiLSTMs -> dense -> CRF.
    il = Input(shape=(None, ), dtype='int32')
    hidden = Embedding(n_char + 1, 200, name="embed")(il)
    for units, layer_name in ((128, "lstm1"), (64, "lstm2"), (64, "lstm3")):
        recurrent = LSTM(units, return_sequences=True,
                         recurrent_dropout=0.5, dropout=0.5)
        hidden = Bidirectional(recurrent, merge_mode="concat",
                               name=layer_name)(hidden)
    hidden = TimeDistributed(Dense(self.num_labs, activation="relu"))(hidden)

    crf = CRF(self.num_labs)
    network = Model(il, crf(hidden))
    network.compile(optimizer="rmsprop", loss=crf.loss_function,
                    metrics=[crf.accuracy])
    network.summary()

    best_score = -1
    best_epoch = -1

    for epoch in range(self.epochsN):
        print("Epoch", epoch, "start at", datetime.now())

        # Visit the length groups in random order, except during the first
        # few epochs.
        if epoch > 2:
            random.shuffle(sizes)

        for size in sizes:
            inputs = np.array(list(seqs_by_len[size]))
            one_hot = [to_categorical(tags, num_classes=self.num_labs)
                       for tags in labels_by_len[size]]
            # One pass over this length group (fit splits it into mini-batches).
            network.fit(inputs, np.array(one_hot), verbose=0, epochs=1)
        print("Trained at", datetime.now())

        # Keep the weights of every epoch.
        save_load_utils.save_all_weights(
            network, 'mini-batch-results/epoch_%s.h5' % epoch)

        # Score this epoch and remember the best one.
        self.test_minibatch(DICT, network)
        score = self.eval()
        if score > best_score:
            best_score = score
            best_epoch = epoch

    # Copy the best epoch's weights to a file named after that epoch.
    print("Choosing the best epoch")
    shutil.copyfile("mini-batch-results/epoch_%s.h5" % best_epoch,
                    "minibatch_%s.h5" % best_epoch)