예제 #1
0
#from bert.extract_feature import BertVector
from albert_zh.extract_feature import BertVector
from dataset_pro import read_dictionary, random_embedding, label_id, read_data, data_generate
from dataset import data_trans

# Sequence-length setting for this script. It is well below the longest
# training sentence noted on the right.
# NOTE(review): longer inputs are presumably truncated downstream — confirm.
MAX_SEQ_LEN = 200  # the longest sentence in the training set has length 1080

# Load the raw data for the training, validation and test sets

# Label mapping, the three CSV splits, and the vocabulary read from the pickle.
label2id = label_id()
train_data = read_data("dataset_pro/train.csv")
dev_data = read_data("dataset_pro/dev.csv")
test_data = read_data("dataset_pro/test.csv")
word2id = read_dictionary("dataset_pro/train.pkl")

# data_trans yields three values per file here; the first one is discarded and
# the remaining two are kept as the "origin" X / y for each split.
_, origin_train_X, origin_train_y = data_trans('dataset/train.txt')
_, origin_dev_X, origin_dev_y = data_trans('dataset/dev.txt')
_, origin_test_X, origin_test_y = data_trans('dataset/test.txt')

# Split every (sent_, tag_) pair of the training data into two parallel lists:
# the pieces of sent_ joined into a single string, and tag_ kept unchanged.
train_sent = []
train_tag = []
for (sent_, tag_) in train_data:
    train_sent.append(''.join(sent_))
    train_tag.append(tag_)

# Same split for the validation data.
dev_sent = []
dev_tag = []
for (sent_, tag_) in dev_data:
    dev_sent.append(''.join(sent_))
    dev_tag.append(tag_)
예제 #2
0
def load_data(filename):
    """Build the dataset for ``filename``.

    The file is parsed with ``data_trans``, which must yield exactly three
    values. The first two are handed to ``spo_generate``; the third is not
    used.

    Args:
        filename: Path passed straight through to ``data_trans``.

    Returns:
        Whatever ``spo_generate`` produces from the parsed file.
    """
    records, texts, _unused_tags = data_trans(filename)
    return spo_generate(records, texts)
예제 #3
0
from model import w2v
from model import label_id_dict


# Restore the trained model. The CRF layer, its loss and its accuracy metric
# are supplied as custom objects for loading.
model = load_model(
    'lstm_crf_ner_0610_3.h5',
    custom_objects={
        "CRF": CRF,
        'crf_loss': crf_loss,
        'crf_viterbi_accuracy': crf_viterbi_accuracy,
    },
)

# w2v() yields four values; only the third one (the test inputs) is used.
_, _, test_x, _ = w2v()

# Inverse of label_id_dict: label id -> label name.
id_label_dict = {idx: label for label, idx in label_id_dict.items()}

# Highest-scoring label id at each position (argmax over the last of 3 axes).
raw_output = model.predict(test_x)
y = np.argmax(raw_output, axis=2)

# Map ids back to label names, one list per row of y. Ids equal to 0 are
# dropped by the truthiness filter.
# NOTE(review): id 0 is presumably the padding label — confirm.
pred_tags = [
    [id_label_dict[tag_id] for tag_id in row if tag_id]
    for row in y
]

# A predicted tag sequence can differ in length from the original annotation,
# so each prediction is adjusted to the annotated length: cut when too long,
# padded with 'O' when too short.
test_sents, test_tags = data_trans('dataset/test.txt')
final_tags = []
for gold, predicted in zip(test_tags, pred_tags):
    shortfall = len(gold) - len(predicted)
    if shortfall > 0:
        predicted = predicted + ['O'] * shortfall
    elif shortfall < 0:
        predicted = predicted[:len(gold)]
    final_tags.append(predicted)

# Evaluate on the test set with seqeval
print(classification_report(test_tags, final_tags, digits=4))