Example #1
import os
import json

import numpy as np
from keras.models import load_model

from att import Attention
from bert.extract_feature import BertVector


def predict(inputTextList):
    print("begin")
    # Load the best-performing model: the .h5 file names end with the
    # validation score, so sorting on that suffix picks the best one
    model_dir = './models'
    files = os.listdir(model_dir)
    models_path = [os.path.join(model_dir, f) for f in files if f.endswith('.h5')]
    best_model_path = sorted(
        models_path,
        key=lambda x: float(x.split('-')[-1].replace('.h5', '')),
        reverse=True)[0]
    print("the best model is", best_model_path)
    model = load_model(best_model_path,
                       custom_objects={"Attention": Attention})
    # Use BERT to extract sentence features
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
    print("the bert model for sentence vector is ready")

    # Load the id -> relation mapping once, outside the prediction loop
    with open('data/rel_dict.json', 'r', encoding='utf-8') as f:
        rel_dict = json.load(f)
    id_rel_dict = {v: k for k, v in rel_dict.items()}

    return_List = []

    for inputText in inputTextList:
        # Expected input format: "person1#person2#document". Split outside
        # the try block so per1/per2/doc are always defined in the handler.
        per1, per2, doc = inputText.split('#')
        try:
            # Mask both person names in the document so the model predicts
            # from context rather than from the names themselves
            text = '$'.join([
                per1, per2,
                doc.replace(per1,
                            len(per1) * '#').replace(per2,
                                                     len(per2) * '#')
            ])
            vec = bert_model.encode([text])["encodes"][0]
            x_train = np.array([vec])

            # Predict and map the class id back to a relation name
            predicted = model.predict(x_train)
            y = np.argmax(predicted[0])

            return_List.append([per1, per2, id_rel_dict[y], doc])
        except Exception as e:
            print("error:", e)
            return_List.append([per1, per2, "", doc])
    return return_List
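
A minimal invocation sketch for reference (the sample strings are made up; the "person1#person2#document" format follows the parsing logic above):

if __name__ == '__main__':
    samples = ['PersonA#PersonB#PersonA and PersonB have worked together for years.']
    for per1, per2, rel, doc in predict(samples):
        print(per1, per2, '->', rel)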
Example #2
import numpy as np
from keras.layers import Input, Dense
from keras.layers import GRU, LSTM, Bidirectional
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from keras.utils import to_categorical
import matplotlib.pyplot as plt

from att import Attention
from load_data import get_train_test_pd
from bert.extract_feature import BertVector


# Read the data files and convert them
train_df, test_df = get_train_test_pd()
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
print('begin encoding')
f = lambda text: bert_model.encode([text])["encodes"][0]

train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)
print('end encoding')

# Training and test sets
x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([vec for vec in train_df['label']])
y_test = np.array([vec for vec in test_df['label']])
# print('x_train: ', x_train.shape)

# Convert the integer labels to one-hot vectors
num_classes = 14
y_train = to_categorical(y_train, num_classes)
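
For reference, to_categorical maps integer class ids to one-hot rows:

# to_categorical([0, 2], num_classes=3)
# -> [[1., 0., 0.],
#     [0., 0., 1.]]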
from InputPassage_OutputSentence import outfile

# Use a distinct name so keras.models.load_model is not shadowed
model = load_model("model/question_sentence_classify_20000.h5")

# Read the sentences to predict
texts = []
with open(outfile, encoding="UTF-8") as f:
    for line in f:
        texts.append(line.strip())
labels = []
true_probabilities = []
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=70)

# Predict each of the sentences above
for text in texts:
    # Convert the sentence into a vector
    vec = bert_model.encode([text])["encodes"][0]
    x_input = np.array([vec])
    # Model prediction
    predicted = model.predict(x_input)
    y = np.argmax(predicted[0])
    print("y:", y)
    label = '1' if y else '0'
    # Probability the model assigns to the positive class
    true_probabilities.append(predicted[0][1])
    labels.append(label)
Example #4
import codecs
import random

import numpy as np
from keras.layers import Input, Masking, GRU, Dense, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

from bert.extract_feature import BertVector


class BertClassification(object):
    def __init__(self,
                 nb_classes=2,
                 gru_dim=128,
                 dense_dim=128,
                 max_len=100,
                 batch_size=128,
                 epochs=10,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 save_weights_file="./model/weights_lstm.h5"):
        self.nb_classes = nb_classes
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.save_weights_file = save_weights_file

        self.nb_samples = 25000  # number of training samples
        self.bert_model = BertVector(pooling_strategy="NONE",
                                     max_seq_len=self.max_len,
                                     bert_model_path="./chinese_L-12_H-768_A-12/",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """ 将文本转换为bert向量  """
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """ Convert tab-separated "label\tcontent" lines into training arrays """
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            label = int(line[0])
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """ 数据生成器 """
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index: index+self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """ 测试数据 """
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X,y = self.data_format(lines)
        return X,y

    def create_model(self):
        # Input: a sequence of token-level BERT vectors (pooling_strategy="NONE")
        x_in = Input(shape=(self.max_len, 768, ))
        # Masking lets the GRU skip zero-padded timesteps
        x_out = Masking(mask_value=0.0)(x_in)
        x_out = GRU(self.gru_dim, dropout=0.25, recurrent_dropout=0.25)(x_out)
        x_out = Dense(self.dense_dim, activation="relu")(x_out)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=0.001),
                      metrics=['accuracy'])

        checkpoint = ModelCheckpoint(self.save_weights_file,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        x_test, y_test = self.data_val()
        steps_per_epoch = int(self.nb_samples / self.batch_size) + 1
        model.fit_generator(self.data_iter(),
                            steps_per_epoch=steps_per_epoch,
                            epochs=self.epochs,
                            verbose=1,
                            validation_data=(x_test, y_test),
                            validation_steps=None,
                            callbacks=[checkpoint]
                            )
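
A minimal usage sketch, assuming the corpus files and the local BERT checkpoint exist at the default paths:

if __name__ == '__main__':
    clf = BertClassification()
    clf.train()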
Example #5
import numpy as np
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import np_utils

from bert.extract_feature import BertVector

# Note: wosy2_to_id (a label -> id mapping) and train_labcont / test_labcont
# (lists of "label\tcontent" lines) must be defined at module level before
# this class is used; see the sketch after this example.


class BertClassification(object):
    def __init__(self,
                 nb_classes=143,
                 dense_dim=256,
                 max_len=128,
                 batch_size=32,
                 epochs=30,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 weights_file_path="./model/bertweights_fc.h5"):
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        self.nb_samples = 46985  # number of training samples
        self.bert_model = BertVector(
            pooling_strategy="REDUCE_MEAN",
            max_seq_len=self.max_len,
            bert_model_path=
            r"D:\赵鲸朋\pycharmModel0905\pycharmModel0905\keras_bert_classification\uncased_L-12_H-768_A-12",
            graph_tmpfile="./data/output/tmp_graph_xxx")

    def text2bert(self, text):
        """ 将文本转换为bert向量  """
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """ 将数据转换为训练格式,输入为列表  """
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            # Map the string label to an integer id via the module-level
            # wosy2_to_id dict (instead of label = int(line[0]))
            label = wosy2_to_id[line[0]]
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """ 数据生成器 """
        # fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        # lines = fr.readlines()
        # fr.close()
        # random.shuffle(lines)
        lines = train_labcont
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """ 测试数据 """
        # fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        # lines = fr.readlines()
        # fr.close()
        # random.shuffle(lines)
        lines = test_labcont
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        # Input: one sentence-level BERT vector (pooling_strategy="REDUCE_MEAN")
        x_in = Input(shape=(768, ))
        x_out = Dense(self.dense_dim, activation="relu")(x_in)  # "tanh" is an alternative
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        model.summary()
        checkpoint = ModelCheckpoint(self.weights_file_path,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       patience=2,
                                       mode='max')
        x_test, y_test = self.data_val()
        model.fit_generator(
            self.data_iter(),
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint, early_stopping])
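
The module-level globals used above are not defined in this snippet; a sketch of how they might be prepared (the file paths reuse the class defaults, and the label extraction assumes the first tab-separated field is the label):

import codecs

with codecs.open("data/sent.train", "r", "utf-8") as fr:
    train_labcont = [line for line in fr if line.strip()]
with codecs.open("data/sent.test", "r", "utf-8") as fr:
    test_labcont = [line for line in fr if line.strip()]

# Map each distinct label to an integer id
all_labels = sorted({line.split("\t")[0] for line in train_labcont})
wosy2_to_id = {label: i for i, label in enumerate(all_labels)}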
Example #6
#coding:utf-8

from bert.extract_feature import BertVector

pooling_strategy = "REDUCE_MEAN"
#pooling_strategy = "NONE"
bc = BertVector(pooling_strategy=pooling_strategy, max_seq_len=80)
s1 = '人 同 去 福田 图书馆 啊 在 家 写 作业 巨 没 feel , 我 的 作业'
s2 = "人同去福田图书馆啊在家写作业巨没feel,我的作业"
v = bc.encode([s1])
v1 = v["encodes"][0]
print(v1)
v = bc.encode([s2])
v2 = v["encodes"][0]
print(v2)
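
For reference, the pooling strategy determines the vector shape, which is why the classifiers in these examples use different Input shapes:

# REDUCE_MEAN -> one 768-dim vector per sentence: Input(shape=(768,))
# NONE        -> one 768-dim vector per token, padded to max_seq_len:
#                Input(shape=(max_seq_len, 768))
print(v1.shape)  # (768,) under REDUCE_MEAN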
import codecs
import random

import numpy as np
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from sklearn.metrics import precision_score, recall_score, f1_score


class BertClassification(object):
    def __init__(self,
                 nb_classes=3,
                 dense_dim=256,
                 max_len=100,
                 batch_size=128,
                 epochs=50,
                 train_corpus_path="data/train.csv",
                 test_corpus_path="data/dev.csv",
                 weights_file_path="./model/weights_fc.h5"):
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        # BERT checkpoint location: D:\NLP项目\bert模型\chinese_L-12_H-768_A-12
        self.nb_samples = 17  # number of training samples
        self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                     max_seq_len=self.max_len,
                                     bert_model_path="chinese_L-12_H-768_A-12",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """ 将文本转换为bert向量  """
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """ 将数据转换为训练格式,输入为列表  """
        X, y = [], []
        for line in lines:
            fields = line.strip().split(",")
            try:
                label = int(fields[4])
                content = fields[2]
                vec = self.text2bert(content)
                X.append(vec)
                y.append(label)
            except Exception as e:
                # Skip malformed rows (e.g. the CSV header)
                print("skipping row:", fields[0], e)

        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """ 数据生成器 """
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")  # 训练集在这里
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """ 测试数据 """
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        x_in = Input(shape=(768, ))
        x_out = Dense(self.dense_dim, activation="relu")(x_in)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)  # classification layer
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])

        checkpoint = ModelCheckpoint(self.weights_file_path,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        x_test, y_test = self.data_val()

        model.fit_generator(
            self.data_iter(),  # training-set data generator
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint])
        pred = model.predict(x_test)
        pred = [np.argmax(val) for val in pred]
        print(pred)  # [1, 0, 1, 0, 2, 2, 0, 1, 2, 2, 0, 2, 0, 0, 2, 2, 1]
        y_true = [np.argmax(val) for val in y_test]
        print(y_true)

        p = precision_score(y_true, pred, average='macro')
        r = recall_score(y_true, pred, average='macro')
        f1 = f1_score(y_true, pred, average='macro')
        print("precision:", p)
        print("recall:", r)
        print("f1:", f1)
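
For reference, macro averaging computes the metric per class and then takes the unweighted mean, so small classes weigh as much as large ones:

# precision_score([0, 1, 1], [0, 1, 0], average='macro')
# class 0: 1 TP / (1 TP + 1 FP) = 0.5; class 1: 1 TP / 1 predicted = 1.0
# macro precision = (0.5 + 1.0) / 2 = 0.75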
Example #8
## Shuffle the original dataset and split it into training and validation
## sets (this run samples 10200 examples: the first 8000 are used for
## training, the remaining 2200 for validation)

import numpy as np
from bert.extract_feature import BertVector

# texts and labels are lists prepared earlier in the original script.
# Sample indices without replacement so no example appears twice.
idxs = np.random.choice(len(texts), size=10200, replace=False)

X = []
y = []
for idx in idxs:
    X.append(texts[idx])
    y.append(labels[idx])

X_VEC = []
## Vectorize the texts with BERT
print("start encoding...")
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=100)
for text in X:
    X_VEC.append(bert_model.encode([text])["encodes"][0])

# With pooling_strategy="NONE" each encoding is a (max_seq_len, 768) token
# sequence; vec[0] is the vector of the leading [CLS] token
X_VEC_CLS = []
for vec in X_VEC:
    X_VEC_CLS.append(vec[0])

x_train = np.array(X_VEC_CLS[:8000])
x_test = np.array(X_VEC_CLS[8000:])
y_train = np.array(y[:8000])
y_test = np.array(y[8000:])

# Training and test sets are ready; now build the model

print("start training...")
from keras.models import Sequential
from keras.layers import Dense
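
The original snippet breaks off after these imports; a minimal classifier head consistent with them might look like this (the layer sizes, the binary-label assumption, and the training settings are all assumptions):

model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(768,)))
model.add(Dense(1, activation='sigmoid'))  # assumes binary 0/1 labels
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=32, epochs=10,
          validation_data=(x_test, y_test))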
# -*- coding: utf-8 -*-
# author: Jclian91
# place: Pudong Shanghai
# time: 2020-02-12 12:45
from bert.extract_feature import BertVector

bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=100)

import time
t1 = time.time()
for _ in range(100):
    print(_)
    # Encode a batch of 1000 identical sentences per iteration to measure
    # encoding throughput; only the first vector is kept
    text = ['英国苏格兰政府首席大臣、苏格兰民族党党魁妮古拉·斯特金11日在伦敦说,苏格兰人应有权重新选择是否独立。'] * 1000
    vec = bert_model.encode(text)["encodes"][0]

t2 = time.time()
print(t2 - t1)  # total seconds for 100 batches of 1000 sentences
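
A small follow-up sketch that turns the raw timing into a throughput figure (assumes the loop above ran all 100 iterations):

n_sentences = 100 * 1000
print("sentences/second:", n_sentences / (t2 - t1))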