예제 #1
0
def bilstm_character(train_loc, test_loc):
    """Train a word + character BiLSTM tagger and score it on both splits.

    Both locations are run through ``preprocess`` and ``cuu`` (defined
    elsewhere in the project).  The result is consumed as a list of
    sentences whose items expose the word at index 0 and the tag at index 1.
    Words and tags are looked up directly (there is no UNK fallback at
    lookup time), so ``combine_all`` is expected to return every word and
    tag that occurs in either split.

    Args:
        train_loc: Location of the training data, handed to ``preprocess``.
        test_loc: Location of the test data, handed to ``preprocess``.

    Returns:
        A pair ``(train_scores, test_scores)``.  Each element is the list
        ``[f1, precision, recall]`` for that split.  The training scores are
        computed on the full training set, including the 10% that ``fit``
        holds out for validation.
    """
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_tags = len(tags_all)
    n_words = len(words_all)

    max_len = 130  # tokens per sentence after padding/truncation
    max_len_char = 10  # per-word character length handed to x_char

    # Word indices 0 and 1 are reserved for padding and unknown words.
    word2idx = {w: i + 2 for i, w in enumerate(words_all)}
    word2idx["UNK"] = 1
    word2idx["PAD"] = 0
    # Tag index 0 is reserved for padding, hence n_tags + 1 output classes.
    tag2idx = {t: i + 1 for i, t in enumerate(tags_all)}
    tag2idx["PAD"] = 0
    idx2tag = {i: t for t, i in tag2idx.items()}

    chars = {c for w in words_all for c in w}
    n_chars = len(chars)
    char2idx = {c: i + 2 for i, c in enumerate(chars)}
    char2idx["UNK"] = 1
    char2idx["PAD"] = 0

    def encode(sentences):
        """Return padded (word ids, char ids, tag ids) for ``sentences``."""
        word_ids = [[word2idx[w[0]] for w in s] for s in sentences]
        word_ids = pad_sequences(maxlen=max_len, sequences=word_ids,
                                 value=word2idx["PAD"], padding='post',
                                 truncating='post')
        raw_chars = x_char(sentences, max_len, max_len_char, char2idx)
        char_ids = np.array(raw_chars).reshape(
            (len(raw_chars), max_len, max_len_char))
        tag_ids = [[tag2idx[w[1]] for w in s] for s in sentences]
        tag_ids = pad_sequences(maxlen=max_len, sequences=tag_ids,
                                value=tag2idx["PAD"], padding='post',
                                truncating='post')
        return word_ids, char_ids, tag_ids

    def report(title, true_labels, pred_labels):
        """Compute [f1, precision, recall] once, print them, return them."""
        f1 = f1_score(true_labels, pred_labels)
        precision = precision_score(true_labels, pred_labels)
        recall = recall_score(true_labels, pred_labels)
        print(title)
        print("F1-score: {:.1%}".format(f1))
        print('Precision-score: {:.1%}'.format(precision))
        print('Recall-score: {:.1%}'.format(recall))
        return [f1, precision, recall]

    X_word, X_char, y = encode(cc_train)

    # input and embedding for words
    word_in = Input(shape=(max_len,))
    emb_word = Embedding(input_dim=n_words + 2, output_dim=20,
                         input_length=max_len, mask_zero=True)(word_in)

    # input and embeddings for characters
    char_in = Input(shape=(max_len, max_len_char,))
    emb_char = TimeDistributed(Embedding(input_dim=n_chars + 2, output_dim=10,
                                         input_length=max_len_char, mask_zero=True))(char_in)
    # character LSTM to get word encodings by characters
    char_enc = TimeDistributed(LSTM(units=20, return_sequences=False,
                                    recurrent_dropout=0.5))(emb_char)

    # main LSTM over the concatenated word and character encodings
    x = concatenate([emb_word, char_enc])
    x = SpatialDropout1D(0.3)(x)
    main_lstm = Bidirectional(LSTM(units=50, return_sequences=True,
                                   recurrent_dropout=0.6))(x)
    out = TimeDistributed(Dense(n_tags + 1, activation="softmax"))(main_lstm)
    model = Model([word_in, char_in], out)
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["acc"])
    model.summary()
    # Sparse categorical cross-entropy takes integer targets with a
    # trailing axis of size 1.
    model.fit([X_word, X_char],
              np.array(y).reshape(len(y), max_len, 1),
              batch_size=2, epochs=15, validation_split=0.1, verbose=1)

    y_pred = model.predict([X_word, X_char])
    train_scores = report('Training :       ',
                          original(y, idx2tag),
                          pred2label(y_pred, idx2tag))

    X_word1, X_char1, y2 = encode(cc_test)
    y_pred1 = model.predict([X_word1, X_char1])
    test_scores = report('Testing :       ',
                         original(y2, idx2tag),
                         pred2label(y_pred1, idx2tag))

    return train_scores, test_scores
예제 #2
0
def bilstm_crf(train_loc, test_loc):
    """Train a BiLSTM-CRF tagger and return its scores on the test split.

    Both locations are run through ``preprocess`` and ``cuu`` (defined
    elsewhere in the project).  The result is consumed as a list of
    sentences whose items expose the word at index 0 and the tag at index 1.
    Words and tags are looked up directly, so ``combine_all`` is expected
    to return every word and tag that occurs in either split, and the tag
    set must contain ``"O"`` (used to pad the tag sequences).

    Args:
        train_loc: Location of the training data, handed to ``preprocess``.
        test_loc: Location of the test data, handed to ``preprocess``.

    Returns:
        ``[f1, precision, recall]`` computed on the test split.  The same
        three scores for the training split are printed but not returned.
    """
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_words = len(words_all)
    n_tags = len(tags_all)

    max_len = 130
    word2idx = {w: i for i, w in enumerate(words_all)}
    tag2idx = {t: i for i, t in enumerate(tags_all)}
    idx2tag = {i: t for t, i in tag2idx.items()}

    def encode(sentences):
        """Return (padded word ids, one-hot padded tags) for ``sentences``."""
        # NOTE(review): padding uses the index of the last entry of
        # words_all, while the Embedding below masks index 0 (mask_zero=True),
        # which is the first entry of words_all.  Unless combine_all reserves
        # those slots, padding is not what gets masked -- confirm intent.
        word_ids = pad_sequences(
            maxlen=max_len,
            sequences=[[word2idx[w[0]] for w in s] for s in sentences],
            padding="post",
            value=n_words - 1)
        tag_ids = pad_sequences(
            maxlen=max_len,
            sequences=[[tag2idx[w[1]] for w in s] for s in sentences],
            padding="post",
            value=tag2idx["O"])
        one_hot = [to_categorical(row, num_classes=n_tags) for row in tag_ids]
        return word_ids, one_hot

    def evaluate(word_ids, one_hot_tags):
        """Predict on ``word_ids`` and return [f1, precision, recall]."""
        pred_labels = pred2label(model.predict(word_ids, verbose=1), idx2tag)
        # The gold tags are one-hot, so they go through pred2label as well.
        true_labels = pred2label(one_hot_tags, idx2tag)
        return [f1_score(true_labels, pred_labels),
                precision_score(true_labels, pred_labels),
                recall_score(true_labels, pred_labels)]

    X, y = encode(cc_train)
    X1, y1 = encode(cc_test)

    word_in = Input(shape=(max_len, ))
    x = Embedding(input_dim=n_words + 1,
                  output_dim=50,
                  input_length=max_len,
                  mask_zero=True)(word_in)  # 50-dim embedding
    x = Bidirectional(
        LSTM(units=250, return_sequences=True,
             recurrent_dropout=0.2))(x)  # variational biLSTM
    x = TimeDistributed(Dense(50, activation="relu"))(
        x)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(x)  # output
    model = Model(word_in, out)
    model.compile(optimizer="adam",
                  loss=crf.loss_function,
                  metrics=[crf.accuracy])
    model.summary()
    model.fit(X, np.array(y), batch_size=4, epochs=15, verbose=1)

    train_scores = evaluate(X, y)
    print('Training scores:', train_scores)

    test_scores = evaluate(X1, y1)
    print('Testing scores:', test_scores)
    return test_scores
예제 #3
0
def bilstm_elmo(train_loc, test_loc):
    """Train a residual two-layer BiLSTM on ELMo embeddings; score the test split.

    Both locations are run through ``preprocess`` and ``cuu`` (defined
    elsewhere in the project).  The result is consumed as a list of
    sentences whose items expose the word at index 0 and the tag at index 1.
    The tag set returned by ``combine_all`` must contain ``"O"``, which is
    used to pad the tag sequences.

    The ELMo module is loaded from the hard-coded path
    ``/data/xwang/module_elmo2`` through the TF1 session API.  Its wrapper
    is built with a constant batch size, so only whole batches are used:
    trailing sentences that do not fill a batch of 32 are dropped from both
    training and evaluation.

    Args:
        train_loc: Location of the training data, handed to ``preprocess``.
        test_loc: Location of the test data, handed to ``preprocess``.

    Returns:
        ``[f1, recall, precision]`` computed on the test split.
    """
    train_pre = preprocess(train_loc)
    test_pre = preprocess(test_loc)
    cc_train = cuu(train_pre)
    cc_test = cuu(test_pre)
    words_all, tags_all = combine_all(cc_train, cc_test)
    n_tags = len(tags_all)
    n_words = len(words_all)

    max_len = 130
    batch_size = 32
    tag2idx = {t: i for i, t in enumerate(tags_all)}
    idx2tag = {i: t for t, i in tag2idx.items()}

    def pad_tokens(sentences):
        """Return each sentence as exactly ``max_len`` tokens.

        Keeps the first ``max_len`` tokens and fills short sentences with
        ``"__PAD__"`` on the right.
        """
        padded = []
        for s in sentences:
            tokens = [w[0] for w in s]
            # A negative repeat count yields an empty list, so sentences
            # that are already long enough get no filler.
            padded.append(tokens[:max_len]
                          + ["__PAD__"] * (max_len - len(tokens)))
        return padded

    def pad_tags(sentences):
        """Return tag ids padded with ``"O"`` to ``max_len`` per sentence."""
        # truncating="post" keeps the FIRST max_len tags, matching
        # pad_tokens.  The pad_sequences default ("pre") keeps the last
        # max_len tags, which misaligns tokens and tags for long sentences.
        return pad_sequences(
            maxlen=max_len,
            sequences=[[tag2idx[w[1]] for w in s] for s in sentences],
            padding="post",
            truncating="post",
            value=tag2idx["O"])

    X = pad_tokens(cc_train)
    y = pad_tags(cc_train)

    sess = tf.Session()
    K.set_session(sess)
    elmo_model = hub.Module("/data/xwang/module_elmo2", trainable=True)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())

    def ElmoEmbedding(x):
        # Every sequence is declared to be max_len long, and the length
        # vector has exactly batch_size entries, so the model can only be
        # fed full batches.
        return elmo_model(inputs={
            "tokens": tf.squeeze(tf.cast(x, tf.string)),
            "sequence_len": tf.constant(batch_size * [max_len])
        },
                          signature="tokens",
                          as_dict=True)["elmo"]

    input_text = Input(shape=(max_len, ), dtype=tf.string)
    embedding = Lambda(ElmoEmbedding, output_shape=(max_len, 1024))(input_text)
    x = Bidirectional(
        LSTM(units=512,
             return_sequences=True,
             recurrent_dropout=0.2,
             dropout=0.2))(embedding)
    x_rnn = Bidirectional(
        LSTM(units=512,
             return_sequences=True,
             recurrent_dropout=0.2,
             dropout=0.2))(x)
    x = add([x, x_rnn])  # residual connection to the first biLSTM
    out = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
    model = Model(input_text, out)
    model.compile(optimizer="adam",
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    model.summary()

    # Use only whole batches (see ElmoEmbedding).
    train_num = (len(X) // batch_size) * batch_size
    print(n_words, n_tags, len(X), len(y), train_num)

    X_tr = X[:train_num]
    y_tr = y[:train_num]
    # Sparse categorical cross-entropy takes integer targets with a
    # trailing axis of size 1.
    y_tr = y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1)
    model.fit(np.array(X_tr),
              y_tr,
              batch_size=batch_size,
              epochs=7,
              verbose=1)

    X2_all = pad_tokens(cc_test)
    print(len(X2_all))
    test_num = (len(X2_all) // batch_size) * batch_size
    print(len(X2_all), len(X2_all), test_num)
    X2 = X2_all[:test_num]
    y2 = pad_tags(cc_test)[:test_num]

    test_pred1 = model.predict(np.array(X2), verbose=1)
    pred_labels1 = pred2label(test_pred1, idx2tag)
    true_labels1 = original(y2, idx2tag)

    f1_test = f1_score(true_labels1, pred_labels1)
    precision_test = precision_score(true_labels1, pred_labels1)
    recall_test = recall_score(true_labels1, pred_labels1)
    # NOTE(review): this order is [f1, recall, precision], whereas
    # bilstm_character and bilstm_crf return [f1, precision, recall].
    # Kept as-is so existing callers are unaffected -- confirm which is
    # intended.
    test_scores = [f1_test, recall_test, precision_test]
    print('Testing:', test_scores)
    return test_scores