import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
from keras import backend as K
from keras.models import Model
from keras.layers import (Input, Embedding, LSTM, Bidirectional, Dense,
                          TimeDistributed, SpatialDropout1D, Lambda,
                          add, concatenate)
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras_contrib.layers import CRF
from seqeval.metrics import f1_score, precision_score, recall_score

# The helpers preprocess, cuu, combine_all, x_char, pred2label, and original
# are assumed to be defined elsewhere in this module.


def bilstm_character(train_loc, test_loc):
    """BiLSTM tagger combining word embeddings with a character-level LSTM."""
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_tags = len(tags_all)
    n_words = len(words_all)
    max_len = 130
    max_len_char = 10

    # Word and tag vocabularies; index 0 is reserved for padding and
    # index 1 for unknown words.
    word2idx = {w: i + 2 for i, w in enumerate(words_all)}
    word2idx["UNK"] = 1
    word2idx["PAD"] = 0
    tag2idx = {t: i + 1 for i, t in enumerate(tags_all)}
    tag2idx["PAD"] = 0
    idx2tag = {i: t for t, i in tag2idx.items()}

    X_word = [[word2idx[w[0]] for w in s] for s in cc_train]
    X_word = pad_sequences(maxlen=max_len, sequences=X_word,
                           value=word2idx["PAD"], padding='post',
                           truncating='post')

    # Character vocabulary, with the same PAD/UNK conventions.
    chars = set(w_i for w in words_all for w_i in w)
    n_chars = len(chars)
    char2idx = {c: i + 2 for i, c in enumerate(chars)}
    char2idx["UNK"] = 1
    char2idx["PAD"] = 0
    X_char = x_char(cc_train, max_len, max_len_char, char2idx)

    y = [[tag2idx[w[1]] for w in s] for s in cc_train]
    y = pad_sequences(maxlen=max_len, sequences=y, value=tag2idx["PAD"],
                      padding='post', truncating='post')

    # Input and embedding for words.
    word_in = Input(shape=(max_len,))
    emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                         input_length=max_len, mask_zero=True)(word_in)

    # Input and embeddings for characters.
    char_in = Input(shape=(max_len, max_len_char,))
    emb_char = TimeDistributed(
        Embedding(input_dim=n_chars + 2, output_dim=10,
                  input_length=max_len_char, mask_zero=True))(char_in)

    # Character LSTM to get word encodings by characters.
    char_enc = TimeDistributed(
        LSTM(units=20, return_sequences=False,
             recurrent_dropout=0.5))(emb_char)

    # Main LSTM over the concatenated word and character representations.
    x = concatenate([emb_word, char_enc])
    x = SpatialDropout1D(0.3)(x)
    main_lstm = Bidirectional(
        LSTM(units=50, return_sequences=True, recurrent_dropout=0.6))(x)
    out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)

    model = Model([word_in, char_in], out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
                  metrics=["acc"])
    model.summary()

    history = model.fit(
        [X_word,
         np.array(X_char).reshape((len(X_char), max_len, max_len_char))],
        np.array(y).reshape(len(y), max_len, 1),
        batch_size=2, epochs=15, validation_split=0.1, verbose=1)

    # Training scores.
    y_pred = model.predict(
        [X_word,
         np.array(X_char).reshape((len(X_char), max_len, max_len_char))])
    pred_labels = pred2label(y_pred, idx2tag)
    true_labels = original(y, idx2tag)
    f1_train = f1_score(true_labels, pred_labels)
    precision_train = precision_score(true_labels, pred_labels)
    recall_train = recall_score(true_labels, pred_labels)
    train_scores = [f1_train, precision_train, recall_train]
    print('Training : ')
    print("F1-score: {:.1%}".format(f1_train))
    print('Precision-score: {:.1%}'.format(precision_train))
    print('Recall-score: {:.1%}'.format(recall_train))

    # Testing scores.
    X_word1 = [[word2idx[w[0]] for w in s] for s in cc_test]
    X_word1 = pad_sequences(maxlen=max_len, sequences=X_word1,
                            value=word2idx["PAD"], padding='post',
                            truncating='post')
    X_char1 = x_char(cc_test, max_len, max_len_char, char2idx)
    y2 = [[tag2idx[w[1]] for w in s] for s in cc_test]
    y2 = pad_sequences(maxlen=max_len, sequences=y2, value=tag2idx["PAD"],
                       padding='post', truncating='post')
    y_pred1 = model.predict(
        [X_word1,
         np.array(X_char1).reshape((len(X_char1), max_len, max_len_char))])
    pred_labels1 = pred2label(y_pred1, idx2tag)
    true_labels1 = original(y2, idx2tag)
    f1_test = f1_score(true_labels1, pred_labels1)
    precision_test = precision_score(true_labels1, pred_labels1)
    recall_test = recall_score(true_labels1, pred_labels1)
    test_scores = [f1_test, precision_test, recall_test]
    print('Testing : ')
    print("F1-score: {:.1%}".format(f1_test))
    print('Precision-score: {:.1%}'.format(precision_test))
    print('Recall-score: {:.1%}'.format(recall_test))
    return train_scores, test_scores
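# Hedged helper sketch: x_char is called above but defined elsewhere in this
# repo. This minimal sketch shows what it is assumed to do, based on how
# bilstm_character calls it: build a (n_sentences, max_len, max_len_char)
# grid of character indices, padding both axes with char2idx["PAD"] and
# mapping unseen characters to char2idx["UNK"]. The repo's own
# implementation may differ in detail.
def x_char_sketch(sentences, max_len, max_len_char, char2idx):
    X_char = []
    for s in sentences:
        sent_seq = []
        for i in range(max_len):
            word_seq = []
            for j in range(max_len_char):
                try:
                    # s[i] is a (word, tag) pair; s[i][0][j] is its j-th char.
                    word_seq.append(char2idx.get(s[i][0][j], char2idx["UNK"]))
                except IndexError:
                    # Past the end of the word or the sentence: pad.
                    word_seq.append(char2idx["PAD"])
            sent_seq.append(word_seq)
        X_char.append(sent_seq)
    return X_char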
def bilstm_crf(train_loc, test_loc):
    """BiLSTM tagger with a CRF output layer (keras-contrib)."""
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_words = len(words_all)
    n_tags = len(tags_all)
    max_len = 130

    word2idx = {w: i for i, w in enumerate(words_all)}
    tag2idx = {t: i for i, t in enumerate(tags_all)}

    X = [[word2idx[w[0]] for w in s] for s in cc_train]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post",
                      value=n_words - 1)
    X1 = [[word2idx[w[0]] for w in s] for s in cc_test]
    X1 = pad_sequences(maxlen=max_len, sequences=X1, padding="post",
                       value=n_words - 1)
    y = [[tag2idx[w[1]] for w in s] for s in cc_train]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      value=tag2idx["O"])
    y1 = [[tag2idx[w[1]] for w in s] for s in cc_test]
    y1 = pad_sequences(maxlen=max_len, sequences=y1, padding="post",
                       value=tag2idx["O"])
    y = [to_categorical(i, num_classes=n_tags) for i in y]

    input_layer = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=50,
                      input_length=max_len,
                      mask_zero=True)(input_layer)  # 50-dim embedding
    model = Bidirectional(
        LSTM(units=250, return_sequences=True,
             recurrent_dropout=0.2))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(
        model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output

    model = Model(input_layer, out)
    model.compile(optimizer="adam", loss=crf.loss_function,
                  metrics=[crf.accuracy])
    model.summary()
    history = model.fit(X, np.array(y), batch_size=4, epochs=15, verbose=1)

    # Training scores (predictions on the training set).
    train_pred = model.predict(X, verbose=1)
    idx2tag = {i: t for t, i in tag2idx.items()}
    pred_labels = pred2label(train_pred, idx2tag)
    true_labels = pred2label(y, idx2tag)
    f1_train = f1_score(true_labels, pred_labels)
    precision_train = precision_score(true_labels, pred_labels)
    recall_train = recall_score(true_labels, pred_labels)
    train_scores = [f1_train, precision_train, recall_train]

    # Testing scores.
    y1 = [to_categorical(i, num_classes=n_tags) for i in y1]
    test_pred1 = model.predict(X1, verbose=1)
    pred_labels1 = pred2label(test_pred1, idx2tag)
    true_labels1 = pred2label(y1, idx2tag)
    f1_test = f1_score(true_labels1, pred_labels1)
    precision_test = precision_score(true_labels1, pred_labels1)
    recall_test = recall_score(true_labels1, pred_labels1)
    test_scores = [f1_test, precision_test, recall_test]
    print('Testing scores:', test_scores)
    return test_scores
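# Hedged helper sketches: pred2label and original are likewise assumed to be
# defined elsewhere in this module. These minimal sketches show the behavior
# the metric code above relies on: pred2label argmaxes softmax (or one-hot)
# outputs into tag strings, and original decodes integer targets, with "PAD"
# mapped back to "O" so seqeval ignores padding positions. The repo's own
# versions may differ in detail.
def pred2label_sketch(pred, idx2tag):
    out = []
    for pred_i in pred:
        # Each row of pred_i is a per-tag probability (or one-hot) vector.
        out.append([idx2tag[np.argmax(p)].replace("PAD", "O") for p in pred_i])
    return out


def original_sketch(y, idx2tag):
    out = []
    for row in y:
        # Each row is a padded sequence of integer tag indices.
        out.append([idx2tag[int(t)].replace("PAD", "O") for t in row])
    return out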
def bilstm_elmo(train_loc, test_loc):
    """Residual BiLSTM tagger over ELMo embeddings (TF Hub, TF1 session)."""
    # train_pre = preprocess('/data/xwang/models_origin/convertedBIO/combinedTrain.txt')
    # test_pre = preprocess('/data/xwang/models_origin/convertedBIO/combinedTrain.txt')
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_tags = len(tags_all)
    n_words = len(words_all)
    max_len = 130

    tag2idx = {t: i for i, t in enumerate(tags_all)}

    # ELMo consumes raw token strings, so pad sentences with a "__PAD__"
    # token rather than integer indices.
    X = [[w[0] for w in s] for s in cc_train]
    new_X = []
    for seq in X:
        new_seq = []
        for i in range(max_len):
            try:
                new_seq.append(seq[i])
            except IndexError:
                new_seq.append("__PAD__")
        new_X.append(new_seq)
    X = new_X

    y = [[tag2idx[w[1]] for w in s] for s in cc_train]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post",
                      value=tag2idx["O"])

    batch_size = 32
    sess = tf.Session()
    K.set_session(sess)
    elmo_model = hub.Module("/data/xwang/module_elmo2", trainable=True)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    def ElmoEmbedding(x):
        # sequence_len is a fixed constant, so every batch fed to the model
        # must contain exactly batch_size sentences.
        return elmo_model(inputs={
            "tokens": tf.squeeze(tf.cast(x, tf.string)),
            "sequence_len": tf.constant(batch_size * [max_len])
        }, signature="tokens", as_dict=True)["elmo"]

    input_text = Input(shape=(max_len,), dtype=tf.string)
    embedding = Lambda(ElmoEmbedding,
                       output_shape=(max_len, 1024))(input_text)
    x = Bidirectional(LSTM(units=512, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(embedding)
    x_rnn = Bidirectional(LSTM(units=512, return_sequences=True,
                               recurrent_dropout=0.2, dropout=0.2))(x)
    x = add([x, x_rnn])  # residual connection to the first biLSTM
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)

    model = Model(input_text, out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    model.summary()

    # Truncate to a multiple of batch_size so ElmoEmbedding's fixed
    # sequence_len matches every batch.
    train_num = (len(X) // batch_size) * batch_size
    print(n_words, n_tags, len(X), len(y), train_num)
    X_tr = X[:train_num]
    y_tr = y[:train_num]
    y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
    history = model.fit(np.array(X_tr), y_tr, batch_size=batch_size,
                        epochs=7, verbose=1)

    idx2tag = {i: t for t, i in tag2idx.items()}
    '''test_pred = model.predict(np.array(X_tr), verbose=1)  # predicted labels
    pred_labels = pred2label(test_pred, idx2tag)
    true_labels = original(y, idx2tag)
    f1_train = f1_score(true_labels, pred_labels)
    precision_train = precision_score(true_labels, pred_labels)
    recall_train = recall_score(true_labels, pred_labels)
    train_scores = [f1_train, precision_train, recall_train]
    print('Training:', train_scores)'''

    # Testing scores, again truncated to whole batches.
    X2 = [[w[0] for w in s] for s in cc_test]
    print(len(X2))
    new_X = []
    for seq in X2:
        new_seq = []
        for i in range(max_len):
            try:
                new_seq.append(seq[i])
            except IndexError:
                new_seq.append("__PAD__")
        new_X.append(new_seq)
    test_num = (len(new_X) // batch_size) * batch_size
    print(len(X2), len(new_X), test_num)
    X2 = new_X[:test_num]
    y2 = [[tag2idx[w[1]] for w in s] for s in cc_test]
    y2 = pad_sequences(maxlen=max_len, sequences=y2, padding="post",
                       value=tag2idx["O"])
    y2 = y2[:test_num]
    test_pred1 = model.predict(np.array(X2), batch_size=batch_size, verbose=1)
    pred_labels1 = pred2label(test_pred1, idx2tag)
    true_labels1 = original(y2, idx2tag)
    f1_test = f1_score(true_labels1, pred_labels1)
    precision_test = precision_score(true_labels1, pred_labels1)
    recall_test = recall_score(true_labels1, pred_labels1)
    test_scores = [f1_test, precision_test, recall_test]
    print('Testing:', test_scores)
    return test_scores
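# Hedged usage sketch: a minimal driver, assuming BIO-formatted CoNLL-style
# train/test files. The paths below are hypothetical placeholders, not the
# project's actual data locations.
if __name__ == "__main__":
    train_loc = "data/train.txt"  # hypothetical path
    test_loc = "data/test.txt"    # hypothetical path
    char_train, char_test = bilstm_character(train_loc, test_loc)
    crf_test = bilstm_crf(train_loc, test_loc)
    elmo_test = bilstm_elmo(train_loc, test_loc)
    print("BiLSTM-char test [f1, precision, recall]:", char_test)
    print("BiLSTM-CRF  test [f1, precision, recall]:", crf_test)
    print("BiLSTM-ELMo test [f1, precision, recall]:", elmo_test)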