def train():
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(n_tags, activation='softmax'))(model)  # softmax output layer
    model = Model(input, out)
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
    model.summary()

    # checkpoint
    # filepath = "../result/bilstm-weights-{epoch:02d}-{val_acc:.2f}.hdf5"
    # checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
    # history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=5,
    #                     validation_split=0.1, verbose=1, callbacks=[checkpoint])
    history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=5,
                        validation_split=0.1, verbose=1)

    # Save the model
    model.save(filepath="../result/bi-lstm.h5")

    hist = pd.DataFrame(history.history)
    plt.figure(figsize=(12, 12))
    plt.plot(hist["acc"])
    plt.plot(hist["val_acc"])
    plt.show()
def train(self):
    input = Input(shape=(120,))
    model = Embedding(input_dim=self.num_words, output_dim=50, input_length=120)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
    out = TimeDistributed(Dense(self.num_entities, activation="softmax"))(model)
    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])
    history = model.fit(x=self.X_train, y=np.array(self.Y_train), batch_size=64, epochs=10,
                        validation_data=(self.X_validation, self.Y_validation))
    model.save("../models/ner_" + str(datetime.utcnow().microsecond))
    test_eval = model.evaluate(self.X_test, self.Y_test, verbose=0)
    print('Test loss:', test_eval[0])
    print('Test accuracy:', test_eval[1])
    return model, history
def run():
    sentences = read_train_file(TRAIN_PATH)
    word_map, tag_map = create_word_idx()
    max_len = max([len(s) for s in sentences])

    X = [[word_map[w[0]] for w in s] for s in sentences]
    n_words = len(word_map)
    n_tags = 9
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
    y = [[tag_map[w[2]] for w in s] for s in sentences]
    y_testing = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag_map["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y_testing]

    dev_sentences = read_train_file(DEV_PATH)
    dev_max_len = max([len(s) for s in dev_sentences])
    X_dev = [[word_map[w[0]] for w in s] for s in dev_sentences]
    X_dev = pad_sequences(maxlen=max_len, sequences=X_dev, padding="post", value=n_words - 1)
    y_dev = [[tag_map[w[2]] for w in s] for s in dev_sentences]
    y_dev = pad_sequences(maxlen=max_len, sequences=y_dev, padding="post", value=tag_map["O"])
    y_dev = [to_categorical(i, num_classes=9) for i in y_dev]

    test_sentences = read_train_file(TEST_PATH)
    test_max_len = 33
    X_test = [[word_map[w[0]] for w in s] for s in test_sentences]
    X_test = pad_sequences(maxlen=test_max_len, sequences=X_test, padding="post", value=n_words - 1)
    y_test = [[tag_map[w[2]] for w in s] for s in test_sentences]
    y_test = pad_sequences(maxlen=test_max_len, sequences=y_test, padding="post", value=tag_map["O"])

    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20, input_length=max_len)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=50, return_sequences=True, recurrent_dropout=0.1))(model)
    model = TimeDistributed(Dense(n_tags, activation="relu"))(model)  # per-token scores feeding the CRF
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])

    history = model.fit(X, np.array(y), batch_size=32, epochs=5, verbose=1)
    model.save('simple_model.h5')
    return model, tag_map, X_test, y_test
def train():
    input = Input(shape=(input_max_len,))
    model = Embedding(vocab_size, embedding_size, weights=[glove_embedding_matrix()],
                      input_length=input_max_len, trainable=False)(input)
    model = Bidirectional(LSTM(embedding_size, dropout=dropout,
                               recurrent_dropout=recurrent_dropout,
                               return_sequences=True))(model)
    model = Bidirectional(LSTM(2 * embedding_size, dropout=dropout,
                               recurrent_dropout=recurrent_dropout,
                               return_sequences=True))(model)
    model = TimeDistributed(Dense(embedding_size, activation='sigmoid'))(model)
    model = Flatten()(model)
    model = Dense(input_max_len, activation='sigmoid')(model)
    out = model
    model = Model(input, out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    print(model.summary())

    history = model.fit(padded_features, np.array(final_label_updated),
                        validation_split=validation_split, epochs=epochs,
                        batch_size=batch_size, verbose=logging, shuffle=True)
    model.save(model_name)
    metrics(history, model)
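# glove_embedding_matrix() is referenced above but not defined in this snippet.
# Below is a minimal sketch of the usual pattern, assuming a `word_index` dict
# (word -> row index) and a GloVe text file whose dimensionality matches
# embedding_size; the names and the file path are assumptions, not the original code.
import numpy as np

def glove_embedding_matrix():
    matrix = np.zeros((vocab_size, embedding_size))
    with open('glove.6B.100d.txt', encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            word, vec = parts[0], np.asarray(parts[1:], dtype='float32')
            # rows for words missing from GloVe stay all-zero
            if word in word_index and word_index[word] < vocab_size:
                matrix[word_index[word]] = vec
    return matrix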
metrics=["accuracy"]) model.summary() history = model.fit(X_train, np.array(y_train), batch_size=32, epochs=1, validation_split=0.1, verbose=1, callbacks=callbacks) loss, accuracy = model.evaluate(X_test, np.array(y_test)) # save model print('saved model to ', args.output_model_path) model.save(MODEL_FILE) with file_io.FileIO(MODEL_FILE, mode='rb') as input_f: with file_io.FileIO(args.output_model_path + '/' + MODEL_FILE, mode='wb+') as output_f: output_f.write(input_f.read()) # write out metrics metrics = { 'metrics': [{ 'name': 'accuracy-score', 'numberValue': accuracy, 'format': "PERCENTAGE", }] } with file_io.FileIO('/mlpipeline-metrics.json', 'w') as f:
test_sentence = tokenize(sentence)  # Tokenization

# Preprocessing
x_test_sent = pad_sequences(sequences=[[word2idx.get(w, 0) for w in test_sentence]],
                            padding="post", value=word2idx["PAD"], maxlen=MAX_LEN)

# Evaluation
p = model.predict(np.array([x_test_sent[0]]))
p = np.argmax(p, axis=-1)

# Visualization
print("{:15}||{}".format("Word", "Prediction"))
print(30 * "=")
for w, pred in zip(test_sentence, p[0]):
    print("{:15}: {:5}".format(w, idx2tag[pred]))

interact_manual(get_prediction,
                sentence=widgets.Textarea(placeholder='Next Monday is Christmas!'))

# Saving vocab
with open('/path/to/save/word_to_index.pickle', 'wb') as handle:
    pickle.dump(word2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('/path/to/save/tag_to_index.pickle', 'wb') as handle:
    pickle.dump(tag2idx, handle, protocol=pickle.HIGHEST_PROTOCOL)

model.save('/path/to/save/lstm_crf_weights')
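# Counterpart sketch for inference in a fresh session: reload the pickled
# vocabularies and the saved model. The '/path/to/save/...' paths are the same
# placeholders as above; the custom_objects assume the model uses the
# keras_contrib CRF layer, as the other snippets in this collection do.
import pickle
from keras.models import load_model
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_viterbi_accuracy

with open('/path/to/save/word_to_index.pickle', 'rb') as handle:
    word2idx = pickle.load(handle)
with open('/path/to/save/tag_to_index.pickle', 'rb') as handle:
    tag2idx = pickle.load(handle)
idx2tag = {i: t for t, i in tag2idx.items()}

model = load_model('/path/to/save/lstm_crf_weights',
                   custom_objects={'CRF': CRF, 'crf_loss': crf_loss,
                                   'crf_viterbi_accuracy': crf_viterbi_accuracy})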
monitor = EarlyStopping(monitor = "val_acc", min_delta = 0.0001,
                        patience = 3, verbose = 1, mode = "max");
board = TensorBoard(log_dir = "log/{}".format(arguments.id));
model.fit(([inputs, cues] if arguments.cues else inputs), outputs,
          validation_split = arguments.vs, batch_size = arguments.bs,
          epochs = arguments.epochs, callbacks = [monitor, board], verbose = 1);
if arguments.debug:
    print("model.evaluate() on training: {}"
          "".format(model.evaluate(([inputs, cues] if arguments.cues else inputs),
                                   outputs, verbose = 1)));
model.save(arguments.id + ".h5");

#
# in a few, rare circumstances, we allow ourselves to re-interpret variable
# names, as is the case of .inputs. and .outputs. here: now turning our focus
# to the evaluation data.
#
n = 0;
unknown = 0;
inputs = np.zeros((len(test), LENGTH), dtype = int);
cues = np.zeros((len(test), LENGTH), dtype = int);
golds = np.zeros((len(test), LENGTH,
                  len(classes) - (2 if arguments.cues else 0)), dtype = int);
for i, sentence in enumerate(test):
    n += len(sentence["nodes"]);
X = X.reshape((200000, 75))
print("Reshaping emb...")
embedding_matrix = embedding_matrix.reshape((VOCAB_DIM, EMBEDDING_DIM))
print("Reshaping Y...")
Y = Y.reshape((200000, max_len, n_tags))

input = Input(shape=(max_len,))
model = Embedding(input_dim=VOCAB_DIM, output_dim=EMBEDDING_DIM, input_length=max_len,
                  weights=[embedding_matrix], mask_zero=True,
                  trainable=True)(input)  # 100-dim embedding
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = Dense(n_tags, activation="softmax")(model)
model = Model(input, out)
model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['accuracy'])
model.summary()

# Fit
model.fit(X, Y, batch_size=512, epochs=4, validation_split=0.1, verbose=1)

# Save
model.save('./models/model_lstm_100.h5')
crf = CRF(len(labels))  # CRF layer
out = crf(model)  # output
model = Model(input, out)

if not os.path.isfile(model_name):
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=20,
                        validation_split=0.1, verbose=1)
    model.save(model_name)
else:
    custom_objects = {
        'CRF': CRF,
        'crf_loss': crf_loss,
        'crf_viterbi_accuracy': crf_viterbi_accuracy
    }
    model = load_model(model_name, custom_objects=custom_objects)
# plot_model(model, to_file='lstm_crf.png')

# Evaluation
y_pred = model.predict(X_te)
y_pred = np.argmax(y_pred, axis=-1)
y_test_act = np.argmax(y_te, axis=-1)
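# A plausible continuation of the evaluation (the snippet breaks off after
# y_test_act): map the indices back to label strings before scoring, as the
# other snippets here do; idx2label is an assumed index -> label lookup built
# from the `labels` list used above.
idx2label = {i: l for i, l in enumerate(labels)}
y_pred_tags = [[idx2label[i] for i in row] for row in y_pred]
y_true_tags = [[idx2label[i] for i in row] for row in y_test_act]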
dataset_size = train_data.shape[0]
batches_per_epoch = dataset_size / batch_size
lr_decay = (1. / (1 / 32) - 1) / batches_per_epoch

model.compile(optimizer=Adam(lr=0.016, decay=0.001),
              loss=crf.loss_function, metrics=[crf.accuracy])
model.summary()

history = model.fit(X_tr, np.array(y_tr), batch_size=batch_size, epochs=epochs,
                    validation_data=(X_v, np.array(y_v)), verbose=1)

### Save Model
model.save(root_path + '/models/lstm/w2v_bilstm_crf_1.h5')

from keras.models import load_model
# model = load_model(root_path + '/models/lstm/w2v_bilstm_crf.h5')

# history.history is a dictionary; its keys are val_loss, val_acc, loss, acc
hist = pd.DataFrame(history.history)
fig = plt.figure(figsize=(12, 12))

# add subplots
sub_fig1 = fig.add_subplot(1, 2, 1)  # 1 row, 2 cols, 1st figure
sub_fig2 = fig.add_subplot(1, 2, 2)

# set titles
sub_fig1.set_title('Accuracy')
sub_fig2.set_title('Loss')
print(hist)
# set values and labels
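# A plausible completion of the truncated plotting code ("set values and
# labels"): draw the curves named in the comment above into the two subplots.
sub_fig1.plot(hist['acc'], label='train')
sub_fig1.plot(hist['val_acc'], label='validation')
sub_fig2.plot(hist['loss'], label='train')
sub_fig2.plot(hist['val_loss'], label='validation')
sub_fig1.legend()
sub_fig2.legend()
plt.show()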
# split dataset
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.1)

# building the layers of the neural network
input = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=50, input_length=max_len)(input)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)  # softmax output layer
model = Model(input, out)
model.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

history = model.fit(X_tr, np.array(y_tr), batch_size=32, epochs=5,
                    validation_split=0.1, verbose=1)

filename = 'ner.sav'
model.save(filename)

# hist = pd.DataFrame(history.history)
# plt.figure(figsize=(12, 12))
# plt.plot(hist["acc"])
# plt.plot(hist["val_acc"])
# plt.show()

# testing some predictions
# use this model to post a new sentence
i = 2318
p = model.predict(np.array([X_te[i]]))
p = np.argmax(p, axis=-1)
print("{:15} ({:5}): {}".format("Word", "True", "Pred"))
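# A plausible continuation (the snippet is truncated right after the header
# line): print word, gold tag, and prediction per token, mirroring the format
# used elsewhere in this collection; the `words` and `tags` index -> string
# lists are assumptions, not defined in this snippet.
true = np.argmax(np.array(y_te[i]), axis=-1)
for w, t, pred in zip(X_te[i], true, p[0]):
    print("{:15} ({:5}): {}".format(words[w], tags[t], tags[pred]))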
import pickle

import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Dropout, Bidirectional

with open('ci_files/x_set', 'rb') as fp:
    X = pickle.load(fp)
with open('ci_files/y_set', 'rb') as fp:
    y = pickle.load(fp)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

input_layer = Input(shape=(56,))
model = Embedding(input_dim=26302, output_dim=56, input_length=56)(input_layer)
model = Dropout(0.1)(model)
model = Bidirectional(LSTM(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(17, activation="softmax"))(model)
model = Model(input_layer, out)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.fit(X_train, np.array(y_train), batch_size=32, epochs=1,
          validation_split=0.2, verbose=1)
model.save("model.pt")
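# A small follow-up sketch: score the held-out split created by
# train_test_split above, since the script otherwise never touches it.
loss, acc = model.evaluate(X_test, np.array(y_test), verbose=0)
print('test loss:', loss, 'test accuracy:', acc)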
def train_eval(data_path, model_name, option='simple', emb_path=None):
    """Train a model with the data in path.

    Save it (and the formatting) as model_name.
    If option is 'emb', emb_path is the path to the embedding to be used.
    """
    # get the data
    try:
        X_train, y_train = get_data(data_path + '/train')
        X_val, y_val = get_data(data_path + '/val')
        X_test, y_test = get_data(data_path + '/test')
    except Exception:
        raise Exception("Some data file does not exist")

    # preprocess the texts
    for X in [X_train, X_val, X_test]:
        preprocess_text(X)

    # Keras needs the sequences to be numerical and padded, as well as the labels.
    # We will need all the words and labels for this.
    words = list(set([w for sent in X_train + X_val + X_test for w in sent]))
    labels = list(set([l for sent in y_train for l in sent]))
    words.append('--PAD--')
    n_labels = len(labels)
    n_words = len(words)
    words2num = {word: i for i, word in enumerate(words)}
    labels2num = {label: i for i, label in enumerate(labels)}

    # a trick for NER: map the padding label to 'O' when it exists,
    # otherwise give '--PAD--' its own class
    if 'O' in labels2num:
        labels2num['--PAD--'] = labels2num['O']
    else:
        labels.append('--PAD--')
        labels2num['--PAD--'] = n_labels
        n_labels += 1

    [X_train_num, X_val_num, X_test_num] = [
        process_sequences(X, words2num) for X in [X_train, X_val, X_test]
    ]
    [y_train_num, y_val_num, y_test_num] = [
        process_sequences(y, labels2num) for y in [y_train, y_val, y_test]
    ]
    [y_train_num, y_val_num, y_test_num] = [
        [to_categorical(i, num_classes=n_labels) for i in y]
        for y in [y_train_num, y_val_num, y_test_num]
    ]

    if option == 'emb':
        try:
            emb_dict = KeyedVectors.load(emb_path)
        except Exception:
            raise Exception("Embedding file does not exist")
        # Build a matrix for the indexes with the vector values of the
        # corresponding words; words missing from the embedding keep zeros
        emb_matrix = np.zeros((len(words), emb_dict.vector_size))
        for i, w in enumerate(words):
            if w in emb_dict:
                emb_matrix[i] = emb_dict[w]

    # We build a Bidirectional LSTM
    input = Input(shape=(None,))
    if option == 'emb':
        model = Embedding(input_dim=n_words, output_dim=emb_dict.vector_size,
                          weights=[emb_matrix])(input)
    else:
        model = Embedding(input_dim=n_words, output_dim=50)(input)
    model = Dropout(0.1)(model)
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)
    # TimeDistributed keeps the outputs for each timestep separated
    model = TimeDistributed(Dense(n_labels, activation="softmax"))(model)

    if option == 'crf':
        crf = CRF(n_labels)  # CRF output layer
        out = crf(model)
        model = Model(input, out)
        model.compile(optimizer="rmsprop", loss=crf.loss_function,
                      metrics=[crf.accuracy])
    else:
        out = model
        model = Model(input, out)
        model.compile(optimizer="rmsprop", loss="categorical_crossentropy",
                      metrics=["accuracy"])

    # Fit the model using the validation data
    model.fit(X_train_num, np.array(y_train_num), batch_size=32, epochs=5,
              validation_data=(X_val_num, np.array(y_val_num)), verbose=1)

    # Save the model and the formatting data
    model.save('{}.hdf5'.format(model_name), overwrite=True)
    formatter = {
        'labels': labels,
        'words': words,
        'words2num': words2num,
        'labels2num': labels2num
    }
    with open('{}-preproc.json'.format(model_name), 'w+') as f:
        json.dump(formatter, f)

    # Evaluate the model on the test data
    predictions = model.predict(X_test_num)
    results = model.evaluate(X_test_num, np.array(y_test_num))
    print("Overall results for the predictions: {}".format(results))

    # These values are not very informative because of class imbalance,
    # so make a better, per-label evaluation
    predictions = np.argmax(predictions, axis=-1)
    predictions = [[labels[i] for i in pred] for pred in predictions]
    evaluate(y_test, predictions, labels)
    return predictions
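# A minimal usage sketch for train_eval above; 'data/my_corpus', the model
# names, and the embedding path are hypothetical. The 'emb' variant expects a
# gensim KeyedVectors file, per the KeyedVectors.load call in the function.
predictions = train_eval('data/my_corpus', 'bilstm_baseline', option='simple')
predictions = train_eval('data/my_corpus', 'bilstm_w2v', option='emb',
                         emb_path='embeddings/word2vec.kv')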
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

word_to_index_path = sys.argv[4] + '.pickle'
with open(word_to_index_path, 'wb') as file:
    pickle.dump(word_to_index, file)

tag_to_index_path = sys.argv[5] + '.pickle'
with open(tag_to_index_path, 'wb') as file:
    pickle.dump(tag_to_index, file)

path = sys.argv[3] + '.sav'
model.save(path)

# Evaluation (f1_score and flat_classification_report are sequence-labelling
# metrics, e.g. from seqeval.metrics and sklearn_crfsuite.metrics)
y_pred = model.predict(X_test)
y_pred = np.argmax(y_pred, axis=-1)
y_test_true = np.argmax(y_test, -1)

# Convert the indices back to tags
y_pred = [[idx2tag[i] for i in row] for row in y_pred]
y_test_true = [[idx2tag[i] for i in row] for row in y_test_true]

print("F1-score is : {:.1%}".format(f1_score(y_test_true, y_pred)))
report = flat_classification_report(y_pred=y_pred, y_true=y_test_true, labels=tags)
print(report)
## Training model
model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])
model.summary()

# filepath = "Model Version/ner_{val_accuracy:.2f}.hdf5"
# checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1,
#                              save_best_only=True, mode='max')
# callbacks_list = [checkpoint]
# history = model.fit(X_train, np.array(y_train), batch_size=256, epochs=3,
#                     validation_split=0.05, verbose=1, callbacks=callbacks_list)
history = model.fit(X_train, np.array(y_train), batch_size=512, epochs=3,
                    validation_split=0.05, verbose=1)
model.save("Model Version/ner_kw.hdf5")
plot_history(history)

# # Loading model
# model = k.models.load_model("Model Version/ner_kw.hdf5",
#                             custom_objects={'CRF': crf, 'crf_loss': crf.loss_function,
#                                             'crf_viterbi_accuracy': crf.accuracy})
# print("Loaded model from disk")
# model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy, 'accuracy'])

i = 0
pred_sing = model.predict(np.array([X_test[i]]))
pred_sing = np.argmax(pred_sing, axis=-1)
gt = np.argmax(y_test[i], axis=-1)
print('\ngt', gt)
print("\n{:14}: ({:5}): {}".format("Word", "True", "Pred"))
for idx, (w, pred) in enumerate(zip(X_test[i], pred_sing[0])):
    print("{:14}: ({:5}): {}".format(words[w], idx2tag[gt[idx]], tags[pred]))
# Save the model (weights)
# save_all_weights | load_all_weights: saves model and optimizer weights (save_weights and save)
model.save_weights("pretrained_models\\fulltext_model_weights.h5")  # sentences_model_weights.h5

'''
# `assert_consumed` can be used as validation that all variable values have
# been restored from the checkpoint.
# See `tf.train.Checkpoint.restore` for other methods in the Status object.
print(load_status.assert_consumed())

# Check that all of the pretrained weights have been loaded.
for a, b in zip(pretrained.weights, model.weights):
    np.testing.assert_allclose(a.numpy(), b.numpy())
'''

# Save the model (architecture, loss, metrics, optimizer state, weights)
model.save('pretrained_models\\fulltext_bi_lstm_crf_dense_linear.h5')  # sentences_bi_lstm_crf_dense_linear.h5

'''
# Load the model
from keras.models import load_model
model = load_model('pretrained_models\\fulltext_bi_lstm_crf_dense_linear.h5',
                   custom_objects={'CRF': CRF(number_labels), 'num_classes': number_labels})
# , 'loss': crf.loss, 'metrics': [crf.accuracy]
'''

# ======================================================================================================================
# Count the total running time
# ======================================================================================================================
total_time = str(timedelta(seconds=(time.time() - start_time)))
print("\n--- %s running time ---" % total_time)

# ======================================================================================================================
# Track model loss per epoch