Example #1
                word_seq.append(char2idx.get("PAD"))
        sent_seq.append(word_seq)
    X_char_te.append(np.array(sent_seq))

# True labels
y = [[tag2idx[w[2]] for w in s] for s in each_sentences]
# Pad each sentence to the same length
y = pad_sequences(maxlen=MAX_LEN,
                  sequences=y,
                  value=tag2idx["PAD"],
                  padding='post',
                  truncating='post')

# One-hot encode (n_tags + 1 classes: the tags plus PAD)
y_te = [to_categorical(i, num_classes=n_tags + 1) for i in y]

# Evaluate
pred_cat = model.predict([
    X_word_te,
    np.array(X_char_te).reshape((len(X_char_te), MAX_LEN, max_len_char))
])
pred = np.argmax(pred_cat, axis=-1)
y_te_true = np.argmax(y_te, -1)

# Convert indices back to tags
pred_tag = [[idx2tag[i] for i in row] for row in pred]
y_te_true_tag = [[idx2tag[i] for i in row] for row in y_te_true]

report = flat_classification_report(y_pred=pred_tag, y_true=y_te_true_tag)
print(report)
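Because every sequence is padded to MAX_LEN, the "PAD" tag dominates the report above and inflates the averages. A minimal sketch (reusing pred_tag and y_te_true_tag from the snippet, and assuming the gold "PAD" tag marks padding) that drops padded positions before scoring:

# Drop PAD positions so the report only scores real tokens
# (assumes the gold "PAD" tag marks padding, as in the snippet above)
pred_tag_clean, true_tag_clean = [], []
for p_row, t_row in zip(pred_tag, y_te_true_tag):
    pred_tag_clean.append([p for p, t in zip(p_row, t_row) if t != "PAD"])
    true_tag_clean.append([t for t in t_row if t != "PAD"])

print(flat_classification_report(y_pred=pred_tag_clean, y_true=true_tag_clean))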
Example #2
    # NOTE: the snippet starts mid-function, so `input`, `word_emb`, and
    # `pos_input` are reconstructed here; the embedding sizes are assumptions
    input = Input(shape=(max_len,))
    word_emb = Embedding(input_dim=n_words + 1,
                         output_dim=20,
                         input_length=max_len)(input)
    pos_input = Input(shape=(max_len,))
    pos_emb = Embedding(input_dim=len(pos),
                        output_dim=10,
                        input_length=max_len)(pos_input)
    modified_input = keras.layers.concatenate([word_emb, pos_emb])
    model_1 = Bidirectional(
        LSTM(units=50, return_sequences=True,
             recurrent_dropout=0.1))(modified_input)
    model = TimeDistributed(Dense(50, activation="relu"))(
        model_1)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model([input, pos_input], out)
    model.compile(optimizer="rmsprop",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    model.summary()
    history = model.fit([X_tr, X_pos_tr],
                        np.array(y_tr),
                        batch_size=32,
                        epochs=60,
                        validation_split=0.1,
                        verbose=1)
    # Testing
    test_pred = model.predict([X_te, X_pos_te], verbose=1)
    idx2tag = {i: w for w, i in tag2idx.items()}
    pred_labels = pred2label(test_pred)
    test_labels = pred2label(y_te)
    print("Recall, Precision and F-score are",
          get_recall_precision(test_labels, pred_labels, "Destination"))
    model.save("BILSTM+CRF_with_pos_without_embeddings.model")
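The snippet calls pred2label() without showing its definition. A plausible implementation, following the common BiLSTM-CRF tutorial pattern (a sketch, not the author's code; it assumes idx2tag as built above and maps the PAD tag to O):

def pred2label(pred):
    # convert (n_samples, max_len, n_tags) probabilities to tag strings
    out = []
    for pred_i in pred:
        out_i = []
        for p in pred_i:
            p_i = np.argmax(p)
            out_i.append(idx2tag[p_i].replace("PAD", "O"))
        out.append(out_i)
    return out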
Example #3
    def test_exist(self, glove, test_data, test_labels):
        # get word embeddings
        utils = wordUtils.Utils()

        if glove:
            # use glove
            self.words_list, self.embedding_matrix = utils.load_glove()
            unword_n = len(self.words_list)

        else:
            self.words_list, self.embedding_matrix = utils.load_word2vec()
            unword_n = len(self.words_list)

        # get the test corpus
        cr = corpusreader.CorpusReader(test_data, test_labels)
        corpus = cr.trainseqs

        # map each token to the index of its embedding
        for idx in range(len(corpus)):
            words = corpus[idx]['tokens']
            words_id = []
            for i in words:
                try:
                    # the index of the word in the embedding matrix
                    index = self.words_list.index(i)
                except ValueError:
                    # the all-zeros embedding marks an unknown word
                    index = unword_n
                words_id.append(index)

            corpus[idx]['embs'] = words_id

        input = Input(shape=(None,))
        el = Embedding(len(self.words_list) + 1, 200, weights=[self.embedding_matrix], trainable=False)(input)
        bl1 = Bidirectional(LSTM(128, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                            merge_mode="concat",
                            name="lstm1")(el)
        bl2 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                            merge_mode="concat",
                            name="lstm2")(bl1)
        bl3 = Bidirectional(LSTM(64, return_sequences=True, recurrent_dropout=0.5, dropout=0.5),
                            merge_mode="concat",
                            name="lstm3")(bl2)
        model = TimeDistributed(Dense(50, activation="relu"))(bl3)  # a dense layer as suggested by neuralNer
        crf = CRF(self.lab_len)  # CRF layer
        out = crf(model)  # output

        model = Model(input, out)
        model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
        model.summary()
        save_load_utils.load_all_weights(model, 'word_models/words_glove_multiLSTM31.h5')

        for doc in corpus:
            doc_arr = doc['embs']
            p = model.predict(np.array([doc_arr]))
            p = np.argmax(p, axis=-1)

            position = 0
            offsets = defaultdict(list)
            counter = 0
            # check if there are any mutations identified
            # {'O': 0, 'B-E': 1, 'I-E': 2, 'E-E': 3, 'S-E': 4}
            B = False
            last = 0
            for idx in p[0]:
                if idx == 1 and last == 1:
                    counter = counter + 1
                    offsets[counter].append(position)
                    B = True
                elif idx == 1:
                    B = True
                    offsets[counter].append(position)
                    last = 1
                elif idx == 2 and B:
                    offsets[counter].append(position)
                    last = 2
                elif idx == 3 and B:
                    offsets[counter].append(position)
                    last = 3
                    B = False
                    counter = counter + 1
                elif idx == 4:
                    offsets[counter].append(position)
                    counter = counter + 1
                    last = 4
                else:
                    B = False

                position = position + 1

            # write the identified spans to this document's output file
            textid = str(doc['textid'])
            with open("words-silver/" + textid + ".a1", 'w') as abstract:
                for i in offsets:
                    word = offsets.get(i)
                    size = len(word)
                    if size == 1:
                        s = word[0]  # a single-token span
                        abstract.write(str(doc['tokstart'][s]) + "\t")
                        abstract.write(str(doc['tokend'][s]) + "\t")
                        abstract.write(str(doc['tokens'][s]) + "\n")
                    elif size > 1:
                        s = word[0]   # first token of the span
                        e = word[-1]  # last token of the span
                        abstract.write(str(doc['tokstart'][s]) + "\t")
                        abstract.write(str(doc['tokend'][e]) + "\t")
                        token = ""
                        for c in word:
                            token = token + doc['tokens'][c]
                        abstract.write(str(token) + "\n")
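The hand-written state machine above groups predicted tag indices into entity spans for the scheme {'O': 0, 'B-E': 1, 'I-E': 2, 'E-E': 3, 'S-E': 4}. For reference, the same idea can be written more compactly; the sketch below is a generic BIOES span grouper, not a drop-in replacement for the author's counter bookkeeping:

def decode_bioes(tag_ids):
    # group token positions into spans: B-E starts, I-E continues,
    # E-E closes, S-E is a single-token span, O resets
    spans, current = [], []
    for pos, t in enumerate(tag_ids):
        if t == 4:                     # S-E: singleton span
            spans.append([pos])
            current = []
        elif t == 1:                   # B-E: open a new span
            if current:
                spans.append(current)  # flush an unterminated span
            current = [pos]
        elif t in (2, 3) and current:  # I-E / E-E inside a span
            current.append(pos)
            if t == 3:                 # E-E: close the span
                spans.append(current)
                current = []
        else:                          # O or a stray tag
            current = []
    if current:
        spans.append(current)
    return spans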