import numpy as np
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn_crfsuite.metrics import flat_classification_report

# Build the character-level test input. The loop head is reconstructed
# from the padding logic below (the original snippet starts mid-loop):
# every word is padded/truncated to max_len_char characters and every
# sentence to MAX_LEN words.
X_char_te = []
for s in each_sentences:
    sent_seq = []
    for i in range(MAX_LEN):
        word_seq = []
        for j in range(max_len_char):
            try:
                word_seq.append(char2idx.get(s[i][0][j]))
            except IndexError:
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char_te.append(np.array(sent_seq))

# True tags: each token tuple stores its tag at index 2
y = [[tag2idx[w[2]] for w in s] for s in each_sentences]

# Pad each tag sequence to the same length
y = pad_sequences(maxlen=MAX_LEN, sequences=y, value=tag2idx["PAD"],
                  padding='post', truncating='post')

# One-hot encode (n_tags + 1 classes to account for PAD)
y_te = [to_categorical(i, num_classes=n_tags + 1) for i in y]

# Evaluate on the test set
pred_cat = model.predict([
    X_word_te,
    np.array(X_char_te).reshape((len(X_char_te), MAX_LEN, max_len_char))
])
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, axis=-1)

# Convert the indices back to tags
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true]

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)
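# The lookup tables used above (char2idx, tag2idx, idx2tag) are built
# elsewhere; a minimal sketch of the usual construction, assuming the
# vocabularies come from the same token tuples as above (this sketch is
# hypothetical, the original construction is not shown):
chars = sorted(set(c for s in each_sentences for w in s for c in w[0]))
tags = sorted(set(w[2] for s in each_sentences for w in s))

char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1   # unseen characters
char2idx["PAD"] = 0   # padding

tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0    # index 0 reserved for padding, hence n_tags + 1 classes
idx2tag = {i: t for t, i in tag2idx.items()}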
import numpy as np
import keras
from keras.models import Model
from keras.layers import Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras_contrib.layers import CRF

# POS-tag embedding, concatenated with the word embedding
# (`input`, `pos_input` and `word_emb` are defined earlier)
pos_emb = Embedding(input_dim=len(pos), output_dim=10,
                    input_length=max_len)(pos_input)
modified_input = keras.layers.concatenate([word_emb, pos_emb])

model_1 = Bidirectional(LSTM(units=50, return_sequences=True,
                             recurrent_dropout=0.1))(modified_input)
model = TimeDistributed(Dense(50, activation="relu"))(model_1)  # dense layer as suggested by NeuralNER
crf = CRF(n_tags)  # CRF layer
out = crf(model)   # output
model = Model([input, pos_input], out)
model.compile(optimizer="rmsprop", loss=crf.loss_function,
              metrics=[crf.accuracy])
model.summary()

history = model.fit([X_tr, X_pos_tr], np.array(y_tr), batch_size=32,
                    epochs=60, validation_split=0.1, verbose=1)

# Testing
test_pred = model.predict([X_te, X_pos_te], verbose=1)
idx2tag = {i: w for w, i in tag2idx.items()}
pred_labels = pred2label(test_pred)
test_labels = pred2label(y_te)
print("Recall, precision and F-score are",
      get_recall_precision(test_labels, pred_labels, "Destination"))

model.save("BILSTM+CRF_with_pos_without_embeddings.model")
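# `pred2label` and `get_recall_precision` are called above but not defined
# in this snippet; minimal sketches consistent with how they are used
# (the original implementations may differ):
def pred2label(pred):
    """Turn one-hot / softmax rows into tag strings, mapping PAD to O."""
    out = []
    for row in pred:
        out.append([idx2tag[np.argmax(p)].replace("PAD", "O") for p in row])
    return out

def get_recall_precision(true_labels, pred_labels, entity):
    """Token-level recall, precision and F1 for one entity type, e.g.
    'Destination' matching tags such as B-Destination / I-Destination."""
    tp = fp = fn = 0
    for t_row, p_row in zip(true_labels, pred_labels):
        for t, p in zip(t_row, p_row):
            if p.endswith(entity) and t == p:
                tp += 1
            elif p.endswith(entity):
                fp += 1
            elif t.endswith(entity):
                fn += 1
    recall = tp / (tp + fn) if tp + fn else 0.0
    precision = tp / (tp + fp) if tp + fp else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return recall, precision, f1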
import numpy as np
from collections import defaultdict
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, LSTM, TimeDistributed, Dense
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils

import corpusreader
import wordUtils


def test_exist(self, glove, test_data, test_labels):
    # Load the pretrained word embeddings
    utils = wordUtils.Utils()
    if glove:
        self.words_list, self.embedding_matrix = utils.load_glove()
    else:
        self.words_list, self.embedding_matrix = utils.load_word2vec()
    # index of the all-zeros embedding row used for unknown words
    unword_n = len(self.words_list)

    # Load the test corpus
    cr = corpusreader.CorpusReader(test_data, test_labels)
    corpus = cr.trainseqs

    # Map each token to its row in the embedding matrix
    for seq in corpus:
        words_id = []
        for w in seq['tokens']:
            try:
                index = self.words_list.index(w)
            except ValueError:
                index = unword_n  # zero embedding marks an unknown word
            words_id.append(index)
        seq['embs'] = words_id

    # Build the stacked BiLSTM-CRF
    inputs = Input(shape=(None,))
    el = Embedding(len(self.words_list) + 1, 200,
                   weights=[self.embedding_matrix], trainable=False)(inputs)
    bl1 = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.5,
                             dropout=0.5), merge_mode="concat", name="lstm1")(el)
    bl2 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5,
                             dropout=0.5), merge_mode="concat", name="lstm2")(bl1)
    bl3 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5,
                             dropout=0.5), merge_mode="concat", name="lstm3")(bl2)
    dense = TimeDistributed(Dense(50, activation="relu"))(bl3)  # dense layer as suggested by NeuralNER
    crf = CRF(self.lab_len)  # CRF layer
    out = crf(dense)  # output
    model = Model(inputs, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function,
                  metrics=[crf.accuracy])
    model.summary()
    save_load_utils.load_all_weights(model, 'word_models/words_glove_multiLSTM31.h5')

    for doc in corpus:
        doc_arr = doc['embs']
        p = model.predict(np.array([doc_arr]))
        p = np.argmax(p, axis=-1)

        # Check if any mutations were identified and collect the token
        # positions of each predicted entity.
        # Tag map: {'O': 0, 'B-E': 1, 'I-E': 2, 'E-E': 3, 'S-E': 4}
        position = 0
        offsets = defaultdict(list)
        counter = 0
        B = False  # True while inside an open entity span
        last = 0
        for idx in p[0]:
            if idx == 1 and last == 1:
                # consecutive B-E tags: close the previous entity, open a new one
                counter += 1
                offsets[counter].append(position)
                B = True
            elif idx == 1:  # B-E: entity begins
                B = True
                offsets[counter].append(position)
                last = 1
            elif idx == 2 and B:  # I-E: entity continues
                offsets[counter].append(position)
                last = 2
            elif idx == 3 and B:  # E-E: entity ends
                offsets[counter].append(position)
                last = 3
                B = False
                counter += 1
            elif idx == 4:  # S-E: single-token entity
                offsets[counter].append(position)
                counter += 1
                last = 4
            else:
                B = False
            position += 1

        # Write the predicted entities to a .a1 file
        textid = str(doc['textid'])
        with open("words-silver/" + textid + ".a1", 'w') as abstract:
            for i in offsets:
                word = offsets[i]
                if len(word) == 1:
                    # singleton: start, end and text of the one token
                    s = word[0]
                    abstract.write(str(doc['tokstart'][s]) + "\t")
                    abstract.write(str(doc['tokend'][s]) + "\t")
                    abstract.write(str(doc['tokens'][s]) + "\n")
                elif len(word) > 1:
                    s = word[0]   # first token of the entity
                    e = word[-1]  # last token of the entity
                    abstract.write(str(doc['tokstart'][s]) + "\t")
                    abstract.write(str(doc['tokend'][e]) + "\t")
                    token = "".join(doc['tokens'][c] for c in word)
                    abstract.write(token + "\n")
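# A short, hypothetical walk-through of the offset decoding above under
# the tag map {'O': 0, 'B-E': 1, 'I-E': 2, 'E-E': 3, 'S-E': 4}:
#
#   predicted indices : [0, 1, 2, 3, 0, 4]
#   tags              : [O, B-E, I-E, E-E, O, S-E]
#   offsets collected : {0: [1, 2, 3], 1: [5]}
#
# i.e. tokens 1-3 are merged into one multi-token entity and token 5 is
# written as a singleton, each as one tab-separated line in the .a1 file.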