def fit_lstm_model(X_train, y_train, n_words, n_tags, seq_len, class_weights, epochs):
    # Set up LSTM model with one input: equal-length sequences of encoded text
    input_seq = Input(shape=(seq_len,))

    # Pass the pretrained GloVe weights into the embedding layer.
    # Note: embedding_matrix is expected to be defined in the enclosing scope.
    embedding = Embedding(input_dim=n_words,
                          output_dim=300,
                          weights=[embedding_matrix],
                          trainable=True)(input_seq)
    embedding = Dropout(0.1)(embedding)

    # Add bidirectional LSTM layer, dense hidden layer, and final output layer
    model = Bidirectional(
        LSTM(units=64, return_sequences=True, recurrent_dropout=0.1))(embedding)
    model = TimeDistributed(Dense(64, activation='relu'))(model)
    output = Dense(n_tags, activation="softmax")(model)

    # Compile and fit the network
    model = Model(inputs=input_seq, outputs=output)
    model.compile(optimizer="adam",
                  loss="categorical_crossentropy",
                  metrics=["accuracy"])
    history = model.fit(X_train, y_train,
                        epochs=epochs,
                        batch_size=32,
                        validation_split=0.1,
                        verbose=1,
                        class_weight=class_weights)  # pass the dict directly, not wrapped in a list

    # Simple performance report on the held-out set
    # (X_test and y_test are also expected to come from the enclosing scope).
    test_loss, test_acc = model.evaluate(X_test, y_test)
    print(f'Model test loss was {test_loss}')
    print(f'Model test accuracy was {test_acc}')

    return model, history
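# --- Usage sketch (assumption, not from the original source) ----------------
# fit_lstm_model() takes a class_weights argument; one common way to build it
# is sklearn's "balanced" heuristic over the flattened tag ids. The dummy
# labels below are illustrative only.
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

flat_tag_ids = np.array([0, 0, 0, 1, 2, 0, 1, 0])   # hypothetical flattened tag ids
classes = np.unique(flat_tag_ids)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=flat_tag_ids)
class_weights = {int(c): float(w) for c, w in zip(classes, weights)}
print(class_weights)                                # e.g. {0: 0.53, 1: 1.33, 2: 2.67}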
def train(self, epochs, embedding=None):
    # Embedded words
    txt_input = Input(shape=(None,), name='txt_input')
    txt_embed = Embedding(input_dim=self.num_words,
                          output_dim=MAX_LEN,
                          input_length=None,
                          name='txt_embedding',
                          trainable=False,
                          weights=[embedding])(txt_input)
    txt_drpot = Dropout(0.1, name='txt_dropout')(txt_embed)

    # Embedded part of speech
    pos_input = Input(shape=(None,), name='pos_input')
    pos_embed = Embedding(input_dim=self.num_pos,
                          output_dim=MAX_LEN,
                          input_length=None,
                          name='pos_embedding')(pos_input)
    pos_drpot = Dropout(0.1, name='pos_dropout')(pos_embed)

    # Embedded characters
    char_in = Input(shape=(None, MAX_LEN_CHAR), name="char_input")
    emb_char = TimeDistributed(
        Embedding(input_dim=self.num_chars,
                  output_dim=MAX_LEN_CHAR,
                  input_length=None))(char_in)
    char_enc = TimeDistributed(
        LSTM(units=20, return_sequences=False, recurrent_dropout=0.5))(emb_char)

    # Concatenate inputs
    x = concatenate([txt_drpot, pos_drpot, char_enc], axis=2)
    x = SpatialDropout1D(0.3)(x)

    # Deep layers
    model = Bidirectional(
        LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(x)
    model = Bidirectional(
        LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)

    # Output
    out = TimeDistributed(Dense(self.num_entities, activation="softmax"))(model)

    model = Model(inputs=[txt_input, pos_input, char_in], outputs=[out])
    model.compile(optimizer="rmsprop",
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    plot_model(model, to_file=self.save_path + 'model_structure.png')
    print(model.summary())

    history = model.fit(
        [self.X_train, self.train_pos, self.train_characters],
        np.array(self.Y_train),
        batch_size=32,
        epochs=epochs,
        validation_data=(
            [self.X_validation, self.valid_pos, self.valid_characters],
            np.array(self.Y_validation)),
        verbose=1)

    model.save(self.save_path + 'model_ner')

    test_eval = model.evaluate(
        [self.X_test, self.test_pos, self.test_characters],
        np.array(self.Y_test))
    print('Test loss:', test_eval[0])
    print('Test accuracy:', test_eval[1])

    return model, history
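# --- Input-preparation sketch (assumption, not part of the original class) --
# train() expects a character input of shape (n_sentences, sentence_len, MAX_LEN_CHAR).
# This is one way such a tensor might be built from tokenized sentences; the
# toy sentence, char2idx mapping and the two length constants are hypothetical.
import numpy as np

sentences = [["EU", "rejects", "German", "call"]]
max_len_demo, max_len_char_demo = 6, 10
chars = sorted({c for sent in sentences for word in sent for c in word})
char2idx = {c: i + 2 for i, c in enumerate(chars)}   # 0 = padding, 1 = unknown

X_char = np.zeros((len(sentences), max_len_demo, max_len_char_demo), dtype="int32")
for i, sent in enumerate(sentences):
    for j, word in enumerate(sent[:max_len_demo]):
        for k, ch in enumerate(word[:max_len_char_demo]):
            X_char[i, j, k] = char2idx.get(ch, 1)
print(X_char.shape)                                  # (1, 6, 10)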
def bilstm(X_train, X_test, Y_train, Y_test, wordembeddings):
    np.random.seed(1234)
    tf.random.set_seed(1234)
    random.seed(1234)

    max_length_sentence = X_train.str.split().str.len().max()
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'',
                          lower=True)
    tokenizer.fit_on_texts(X_train)
    word_index = tokenizer.word_index
    EMBEDDING_DIM = 300
    vocabulary_size = len(word_index) + 1
    print('Found %s unique tokens.' % len(word_index))

    sequences_train = tokenizer.texts_to_sequences(X_train)
    sequences_valid = tokenizer.texts_to_sequences(X_test)
    X_train = pad_sequences(sequences_train, maxlen=max_length_sentence)
    X_val = pad_sequences(sequences_valid, maxlen=X_train.shape[1])
    y_train = np.asarray(Y_train)
    y_val = np.asarray(Y_test)

    # Build the embedding matrix: look each token up in the pretrained vectors,
    # falling back to title case, then upper case, then a random vector.
    embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
    for word, i in word_index.items():
        if word in wordembeddings.keys():
            embedding_vector = wordembeddings[word]
            if len(embedding_vector) == 0:  # if array is empty
                embedding_vector = wordembeddings[word.title()]
                if len(embedding_vector) == 0:
                    embedding_vector = wordembeddings[word.upper()]
                    if len(embedding_vector) == 0:
                        embedding_vector = np.array(
                            [round(np.random.rand(), 8) for _ in range(EMBEDDING_DIM)])
        else:
            # word not in the pretrained vocabulary: use a random vector
            embedding_vector = np.array(
                [round(np.random.rand(), 8) for _ in range(EMBEDDING_DIM)])
        if len(embedding_vector) != 0:
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(vocabulary_size,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                trainable=False)  # try with trainable=True; note: unused, the layer is re-created inline below

    inputs = Input(shape=(X_train.shape[1],))
    model = Embedding(vocabulary_size,
                      EMBEDDING_DIM,
                      input_length=max_length_sentence,
                      weights=[embedding_matrix])(inputs)
    model = Bidirectional(GRU(64))(model)  # CHANGE THIS FOR OTHER MODELS
    model = Dense(900, activation='relu')(model)
    model = Dense(400, activation='relu')(model)
    model = Dense(250, activation='relu')(model)
    model = Dense(204, activation='softmax')(model)
    model = Model(inputs=inputs, outputs=model)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    model.summary()

    callbacks = [EarlyStopping(monitor='val_loss')]
    hist_adam = model.fit(X_train, y_train,
                          batch_size=1000,
                          epochs=200,
                          verbose=1,
                          validation_data=(X_val, y_val),
                          callbacks=callbacks)

    model.save(config.bigru_prepocessed_dataset1_chai)  # CHANGE THIS FOR OTHER MODELS

    y_pred = model.predict(X_val)
    print(y_pred)
    y_val_class = pd.DataFrame(y_val).idxmax(axis=1)
    print(y_val_class)
    y_val_class_argmax = np.argmax(y_val, axis=1)
    y_pred_class_argmax = np.argmax(y_pred, axis=1)
    y_pred_class = pd.DataFrame(y_pred).idxmax(axis=1)
    print(y_pred_class)
    print(classification_report(y_val_class, y_pred_class))

    plt.suptitle('Optimizer : Adam', fontsize=10)
    plt.ylabel('Loss', fontsize=16)
    plt.xlabel('Epoch', fontsize=14)
    plt.plot(hist_adam.history['loss'], color='b', label='Training Loss')
    plt.plot(hist_adam.history['val_loss'], color='r', label='Validation Loss')
    plt.legend(loc='upper right')
    plt.savefig(
        '/home/ubuntu/asset_classification/results/bigru_model_dataset1_preprocessed_chai.png'
    )  # CHANGE THIS FOR OTHER MODELS

    tf.keras.utils.plot_model(model,
                              to_file=config.bigru_architecture,
                              show_shapes=True)  # CHANGE THIS FOR OTHER MODELS

    return (y_pred, y_val_class, y_pred_class, y_val_class_argmax,
            y_pred_class_argmax)
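# --- Embedding-loading sketch (assumption, not from the original project) ---
# bilstm() treats `wordembeddings` as a mapping from token to 300-d vector.
# Such a mapping is commonly built from a GloVe text file; the default path
# below is a hypothetical example.
import numpy as np

def load_glove_embeddings(path="glove.6B.300d.txt"):
    embeddings = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.rstrip().split(" ")
            embeddings[parts[0]] = np.asarray(parts[1:], dtype="float32")
    return embeddings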
def main(self, glove):
    # get word embeddings
    utils = wordUtils.Utils()

    if glove:
        # use GloVe vectors
        self.words_list, self.embedding_matrix = utils.load_glove()
        unword_n = len(self.words_list)
    else:
        # use word2vec vectors
        self.words_list, self.embedding_matrix = utils.load_word2vec()
        unword_n = len(self.words_list)

    # get the training corpus
    cr = corpusreader.CorpusReader(self.textfile, self.annotfile)
    corpus = cr.trainseqs
    print(len(corpus))

    train = []
    print("Processing training data", datetime.now())
    for doc in corpus:
        tmp_dic = {}
        tmp_dic['tokens'] = doc['tokens']
        # convert SOBIE tags to numbers
        tags = doc['bio']
        tags = [self.lablist[i] for i in tags]
        tmp_dic['bion'] = tags
        train.append(tmp_dic)

    n_emb = 0
    n_unk = 0
    # map each token to its index in the embedding matrix
    for idx in range(len(train)):
        words = train[idx]['tokens']
        words_id = []
        for i in words:
            try:
                # the index of the word in the embedding matrix
                index = self.words_list.index(i)
                n_emb = n_emb + 1
            except ValueError:
                # use the all-zero embedding to identify an unknown word
                n_unk = n_unk + 1
                index = unword_n
            words_id.append(index)
        train[idx]['tokens'] = words_id

    # group the training sequences (and their labels) by length
    train_l_d = {}
    train_l_labels = {}
    for seq in train:
        # corpus
        l = len(seq['tokens'])
        if l not in train_l_d:
            train_l_d[l] = []
        train_l_d[l].append(seq['tokens'])
        # labels
        l1 = len(seq['bion'])
        if l1 not in train_l_labels:
            train_l_labels[l1] = []
        train_l_labels[l1].append(seq['bion'])

    # sanity check: tokens and labels must line up in every length bucket
    sizes = list(train_l_d.keys())
    for i in sizes:
        if len(train_l_d[i]) != len(train_l_labels[i]):
            print("Mismatch between token and label buckets for length", i)
        for m in range(len(train_l_d[i])):
            if len(train_l_d[i][m]) != len(train_l_labels[i][m]):
                print("Token/label length mismatch inside bucket", i)

    # build the BiLSTM-CRF model
    input = Input(shape=(None,))
    el = Embedding(len(self.words_list) + 1, 200,
                   weights=[self.embedding_matrix],
                   trainable=False)(input)
    model = Bidirectional(
        LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(el)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(self.lab_len)  # CRF layer
    out = crf(model)  # output

    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()

    f_best = -1
    f_index = -1

    # OK, start actually training
    for epoch in range(self.epochsN):
        print("Epoch", epoch, "start at", datetime.now())
        # Train in batches of different sizes - randomize the order of sizes,
        # except for the first few epochs
        if epoch > 2:
            random.shuffle(sizes)
        for size in sizes:
            batch = train_l_d[size]
            labs = train_l_labels[size]
            tx = np.array([seq for seq in batch])
            y = [seq for seq in labs]
            ty = [to_categorical(i, num_classes=self.lab_len) for i in y]
            # This trains in mini-batches
            model.fit(tx, np.array(ty), verbose=0, epochs=1)
        print("Trained at", datetime.now())

        # save all epochs
        save_load_utils.save_all_weights(model, 'words-results/epoch_%s.h5' % epoch)

        # test the results
        test_data = 'corpus_char/tmVarCorpus/treated/test_data.txt'
        test_labels = 'corpus_char/tmVarCorpus/treated/test_labels.tsv'
        self.test_model(test_data, test_labels, model, glove)
        f = self.eval()
        if f > f_best:
            f_best = f
            f_index = epoch

    # Pick the best model, and save it with a useful name
    print("Choosing the best epoch")
    shutil.copyfile("words-results/epoch_%s.h5" % f_index,
                    "words_glove_%s.h5" % f_index)
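# --- Label-mapping sketch (assumption, not taken from the source) -----------
# main() converts SOBIE tags to integers via self.lablist. A minimal stand-in
# for a single entity type could look like this; the tag set is illustrative.
lablist = {"O": 0, "B-MUT": 1, "I-MUT": 2, "E-MUT": 3, "S-MUT": 4}
tags = ["O", "B-MUT", "E-MUT", "O"]
bion = [lablist[t] for t in tags]
print(bion)                                          # [0, 1, 3, 0]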
model = Bidirectional(
    LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(input)
model = TimeDistributed(Dense(50, activation="relu"))(model)
crf = CRF_2nd(len(data.tag_to_index))
out_layer = crf(model)

model = Model(input, out_layer)
model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

BATCH_SIZE = 64
EPOCHS = 10
history = model.fit(X_train, np.array(y_train),
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_split=0.1,
                    verbose=2)


def pred2label(pred):
    # Convert softmax/one-hot predictions back to tag strings,
    # mapping the padding tag to "O".
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PADword", "O"))
        out.append(out_i)
    return out
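# --- Evaluation sketch (assumption, not part of the original snippet) -------
# The tag sequences produced by pred2label() are typically scored at the
# entity level, e.g. with seqeval. The toy sequences below are illustrative.
from seqeval.metrics import classification_report, f1_score

true_labels = [["B-PER", "I-PER", "O", "B-LOC"]]
pred_labels = [["B-PER", "I-PER", "O", "O"]]
print(f1_score(true_labels, pred_labels))            # about 0.67: one of two entities missed
print(classification_report(true_labels, pred_labels))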
# learning rate decay
dataset_size = train_data.shape[0]
batches_per_epoch = dataset_size / batch_size
# With Keras' time-based decay (lr_t = lr0 / (1 + decay * iterations)), this
# setting brings the learning rate down to 1/32 of its initial value after one epoch.
lr_decay = (1. / (1. / 32) - 1) / batches_per_epoch

model.compile(optimizer=Adam(lr=0.012, decay=lr_decay),
              loss=crf.loss_function,
              metrics=[crf.accuracy])
model.summary()

from keras.utils.vis_utils import plot_model

history = model.fit(
    [X_w_tr, np.array(X_c_tr).reshape((len(X_c_tr), max_len, max_len_char))],
    np.array(y_tr),
    batch_size=batch_size,
    epochs=ephochs,
    validation_data=(
        [X_w_v, np.array(X_c_v).reshape((len(X_c_v), max_len, max_len_char))],
        np.array(y_v)),
    verbose=1)

# history.history is a dictionary whose keys are val_loss, val_acc, loss and acc
hist = pd.DataFrame(history.history)

fig = plt.figure(figsize=(12, 12))
# add subplots
sub_fig1 = fig.add_subplot(1, 2, 1)  # 1 row, 2 cols, 1st figure
sub_fig2 = fig.add_subplot(1, 2, 2)
# set titles
sub_fig1.set_title('Accuracy')
sub_fig2.set_title('Loss')
print(hist)
# set values and labels
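# --- Worked check of the decay setting (assumption: Keras 2.x time-based ----
# decay, lr_t = lr0 / (1 + decay * iterations), iterations counted in batches).
# With lr_decay computed as above, the learning rate drops to 1/32 of its
# initial value after one epoch; the batch count below is hypothetical.
lr0 = 0.012
batches_per_epoch_demo = 100.0
lr_decay_demo = (1. / (1. / 32) - 1) / batches_per_epoch_demo
print(lr0 / (1 + lr_decay_demo * batches_per_epoch_demo))   # 0.012 / 32 = 0.000375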
Y = Y.reshape((Y.shape[0], Y.shape[1], 1))

# 80/20 train/test split
all_train = all[:int(0.8 * all.shape[0]), ...]
Y_train = Y[:int(0.8 * all.shape[0]), ...]
Y_train_dense = np.reshape(Y_train, (Y_train.shape[0], Y_train.shape[1]))
Y_train_dense = np.argmax(Y_train_dense, axis=-1)

all_test = all[int(0.8 * all.shape[0]):, ...]
Y_test = Y[int(0.8 * all.shape[0]):, ...]
Y_test_dense = np.reshape(Y_test, (Y_test.shape[0], Y_test.shape[1]))
Y_test_dense = np.argmax(Y_test_dense, axis=-1)

# Train incrementally, monitoring accuracy after every 5-epoch round
for i in range(100):
    print(i)
    model.fit(all_train, Y_train, batch_size=1000, epochs=5, verbose=1)

    Y_pred_train = model.predict(all_train, batch_size=1000)
    Y_pred_test = model.predict(all_test, batch_size=1000)

    Y_pred_train_dense = np.reshape(
        Y_pred_train, (Y_pred_train.shape[0], Y_pred_train.shape[1]))
    Y_pred_train_dense = np.argmax(Y_pred_train_dense, axis=-1)
    Y_pred_test_dense = np.reshape(
        Y_pred_test, (Y_pred_test.shape[0], Y_pred_test.shape[1]))
    Y_pred_test_dense = np.argmax(Y_pred_test_dense, axis=-1)

    train_acc = np.sum(
        Y_pred_train_dense == Y_train_dense) * 100.0 / len(Y_pred_train_dense)
    val_acc = np.sum(
        Y_pred_test_dense == Y_test_dense) * 100.0 / len(Y_pred_test_dense)