Пример #1
0
    def __init__(self,
                 nb_classes=143,
                 dense_dim=256,
                 max_len=128,
                 batch_size=32,
                 epochs=30,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 weights_file_path="./model/bertweights_fc.h5"):
        """Store classifier hyper-parameters and build the BERT encoder.

        nb_classes/dense_dim size the classifier head, max_len caps the
        BERT sequence length, and the *_path arguments locate the corpora
        and the checkpoint file.
        """
        # Hyper-parameters of the classification head and training loop.
        (self.nb_classes, self.dense_dim, self.max_len,
         self.batch_size, self.epochs) = (nb_classes, dense_dim, max_len,
                                          batch_size, epochs)
        # Corpus and checkpoint locations.
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        self.nb_samples = 46985  # number of training samples
        # Sentence-level encoder: one mean-pooled vector per input text.
        self.bert_model = BertVector(
            pooling_strategy="REDUCE_MEAN",
            max_seq_len=self.max_len,
            bert_model_path=
            r"D:\赵鲸朋\pycharmModel0905\pycharmModel0905\keras_bert_classification\uncased_L-12_H-768_A-12",
            graph_tmpfile="./data/output/tmp_graph_xxx")
Пример #2
0
    def __init__(self,
                 nb_classes=2,
                 gru_dim=128,
                 dense_dim=128,
                 max_len=100,
                 batch_size=128,
                 epochs=10,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 save_weights_file="./model/weights_lstm.h5"):
        """Store GRU-classifier hyper-parameters and build the BERT encoder.

        gru_dim/dense_dim size the recurrent head, max_len caps the BERT
        sequence length, and the *_path/file arguments locate the corpora
        and the weights checkpoint.
        """
        self.nb_classes = nb_classes
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.save_weights_file = save_weights_file

        self.nb_samples = 25000  # number of training samples
        # Token-level encoder (NONE pooling): one vector per token.
        # BUGFIX: the original last line was corrupted — a stray "280"
        # line-number artifact and an unterminated graph_tmpfile string
        # made this a SyntaxError; restored the intended call.
        self.bert_model = BertVector(pooling_strategy="NONE",
                                     max_seq_len=self.max_len,
                                     bert_model_path="./chinese_L-12_H-768_A-12/",
                                     graph_tmpfile="./tmp_graph_xxx")
Пример #3
0
def predict(inputTextList):
    """Predict the person-to-person relation for each input item.

    Each element of inputTextList must look like "per1#per2#document".
    Returns a list of [per1, per2, relation, doc] rows; the relation is
    "" when prediction fails for that item, and per1/per2 are "" when the
    item itself is malformed.
    """
    print("begin")
    # Pick the checkpoint with the best score encoded in its file name
    # (checkpoints are named like ...-<score>.h5).
    model_dir = './models'
    files = os.listdir(model_dir)
    models_path = [os.path.join(model_dir, _) for _ in files]
    best_model_path = sorted(
        models_path,
        key=lambda x: float(x.split('-')[-1].replace('.h5', '')),
        reverse=True)[0]
    print("the best model is", best_model_path)
    model = load_model(best_model_path,
                       custom_objects={"Attention": Attention})
    # Token-level BERT encoder (NONE pooling, max 80 tokens).
    bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
    print("the bert model for sentence vector is ready")

    # Load the relation dictionary once instead of re-reading the JSON
    # file on every loop iteration, as the original code did.
    with open('data/rel_dict.json', 'r', encoding='utf-8') as f:
        rel_dict = json.load(f)
    id_rel_dict = {v: k for k, v in rel_dict.items()}

    return_List = []
    for inputText in inputTextList:
        # BUGFIX: the original bare "except:" referenced per1/per2/doc from
        # a *previous* iteration when split() failed (NameError on the
        # first item). Handle the malformed-input case separately.
        try:
            per1, per2, doc = inputText.split('#')
        except ValueError:
            print("error")
            return_List.append(["", "", "", inputText])
            continue
        try:
            # Mask both person names with '#' runs of equal length so the
            # model cannot key on the names themselves.
            text = '$'.join([
                per1, per2,
                doc.replace(per1,
                            len(per1) * '#').replace(per2,
                                                     len(per2) * '#')
            ])
            vec = bert_model.encode([text])["encodes"][0]
            x_train = np.array([vec])

            # Model prediction: argmax over the class probabilities.
            predicted = model.predict(x_train)
            y = np.argmax(predicted[0])
            return_List.append([per1, per2, id_rel_dict[y], doc])
        except Exception:
            # Best-effort: keep going on per-item failures, but no longer
            # swallow KeyboardInterrupt/SystemExit as the bare except did.
            print("error")
            return_List.append([per1, per2, "", doc])
    return return_List
Пример #4
0
    def __init__(self,
                 nb_classes=2,
                 dense_dim=256,
                 max_len=100,
                 batch_size=128,
                 epochs=5,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 weights_file_path="./model/weights_fc.h5"):
        """Record hyper-parameters and create the mean-pooling BERT encoder."""
        # Classifier and training hyper-parameters.
        (self.nb_classes, self.dense_dim, self.max_len,
         self.batch_size, self.epochs) = (nb_classes, dense_dim,
                                          max_len, batch_size, epochs)
        # Checkpoint and corpus locations.
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        self.nb_samples = 25000  # number of training samples
        # One fixed-size vector per sentence (mean of token embeddings).
        self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                     max_seq_len=self.max_len)
    def __init__(self,
                 nb_classes=3,
                 dense_dim=256,
                 max_len=100,
                 batch_size=128,
                 epochs=50,
                 train_corpus_path="data/train.csv",
                 test_corpus_path="data/dev.csv",
                 weights_file_path="./model/weights_fc.h5"):
        """Record hyper-parameters and create the mean-pooling BERT encoder."""
        # Classifier and training hyper-parameters.
        (self.nb_classes, self.dense_dim, self.max_len,
         self.batch_size, self.epochs) = (nb_classes, dense_dim,
                                          max_len, batch_size, epochs)
        # Checkpoint and corpus locations.
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        self.nb_samples = 17  # number of training samples
        # Sentence-level encoder backed by the local Chinese BERT model.
        self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                     max_seq_len=self.max_len,
                                     bert_model_path="chinese_L-12_H-768_A-12",
                                     graph_tmpfile="./tmp_graph_xxx")
Пример #6
0
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, Dense
from keras.callbacks import EarlyStopping
from att import Attention
from keras.layers import GRU, LSTM, Bidirectional
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt

from load_data import get_train_test_pd
from bert.extract_feature import BertVector


# Read the data files and convert each text to BERT vectors.
train_df, test_df = get_train_test_pd()
# NONE pooling keeps token-level vectors — presumably one (max_seq_len, 768)
# matrix per sentence; confirm against bert.extract_feature.
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=80)
print('begin encoding')
# Map one sentence to its BERT encoding.
f = lambda text: bert_model.encode([text])["encodes"][0]

train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)
print('end encoding')

# Build train / test arrays from the encoded column and the labels.
x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([vec for vec in train_df['label']])
y_test = np.array([vec for vec in test_df['label']])
# print('x_train: ', x_train.shape)

# 将类型y值转化为ont-hot向量
import pandas as pd
import numpy as np
from bert.extract_feature import BertVector
from keras.models import load_model
from InputPassage_OutputSentence import outfile
# Load the trained question/sentence classifier.
# BUGFIX: the original rebound the name "load_model", shadowing the
# imported keras function of the same name; use a distinct name.
clf_model = load_model("model/question_sentence_classify_20000.h5")

# Sentences to classify, one per line.
# BUGFIX: the original opened the file and never closed it; use "with".
with open(outfile, encoding="UTF-8") as f:
    texts = [line.strip() for line in f]

labels = []
true_posibilities = []
# One mean-pooled 768-d vector per sentence.
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=70)

# Predict each sentence in turn.
for text in texts:
    # Sentence -> fixed-size BERT vector.
    vec = bert_model.encode([text])["encodes"][0]
    x_train = np.array([vec])
    # Model prediction; class = argmax over probabilities.
    predicted = clf_model.predict(x_train)
    y = np.argmax(predicted[0])
    print("y:", y)
    label = '1' if y else '0'
    # Probability of the positive class (index 1).
    true_posibility = predicted[0][1]
Пример #8
0
class BertClassification(object):
    """BERT + GRU text classifier.

    Sentences are encoded with BertVector (token-level vectors, NONE
    pooling) and classified by a masked GRU followed by a dense softmax
    head. Corpora are tab-separated "label<TAB>content" files.
    """

    def __init__(self,
                 nb_classes=2,
                 gru_dim=128,
                 dense_dim=128,
                 max_len=100,
                 batch_size=128,
                 epochs=10,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 save_weights_file="./model/weights_lstm.h5"):
        """Store hyper-parameters and build the BERT encoder."""
        self.nb_classes = nb_classes
        self.gru_dim = gru_dim
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path
        self.save_weights_file = save_weights_file

        self.nb_samples = 25000  # number of training samples
        # BUGFIX: the original last line was corrupted — a stray "280"
        # line-number artifact and an unterminated graph_tmpfile string
        # made the whole class a SyntaxError; restored the intended call.
        self.bert_model = BertVector(pooling_strategy="NONE",
                                     max_seq_len=self.max_len,
                                     bert_model_path="./chinese_L-12_H-768_A-12/",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """Encode *text* into its BERT representation (token vectors)."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Convert tab-separated label/content lines into (X, one-hot y)."""
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            label = int(line[0])
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Infinite batch generator over the (shuffled) training corpus."""
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index: index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Load and format the whole validation corpus."""
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        """(max_len, 768) token vectors -> masked GRU -> dense -> softmax."""
        x_in = Input(shape=(self.max_len, 768, ))
        # Mask zero-padded timesteps so the GRU ignores padding.
        x_out = Masking(mask_value=0.0)(x_in)
        x_out = GRU(self.gru_dim, dropout=0.25, recurrent_dropout=0.25)(x_out)
        x_out = Dense(self.dense_dim, activation="relu")(x_out)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        """Compile the model and fit it, checkpointing on best val_acc."""
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(lr=0.001),
                      metrics=['accuracy'])

        checkpoint = ModelCheckpoint(self.save_weights_file, monitor='val_acc',
                                     verbose=1, save_best_only=True, mode='max')
        x_test, y_test = self.data_val()
        # +1 so the final partial batch is not dropped.
        steps_per_epoch = int(self.nb_samples / self.batch_size) + 1
        model.fit_generator(self.data_iter(),
                            steps_per_epoch=steps_per_epoch,
                            epochs=self.epochs,
                            verbose=1,
                            validation_data=(x_test, y_test),
                            validation_steps=None,
                            callbacks=[checkpoint]
                            )
import pickle
# 使用GPU训练
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7,8"

import numpy as np
from load_data import train_df, test_df
from keras.utils import to_categorical
from keras.models import Model
from keras.optimizers import Adam
from keras.layers import Input, BatchNormalization, Dense
from bert.extract_feature import BertVector
# Fold label '3' into class '0' so the task becomes binary.
train_df.loc[train_df['label'] == '3', ['label']] = '0'
test_df.loc[test_df['label'] == '3', ['label']] = '0'

# Read the data and convert each text to one sentence vector.
bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=512)
print('begin encoding')

# One mean-pooled 768-d vector per text.
f = lambda text: bert_model.encode([text])["encodes"][0]
train_df['x'] = train_df['text'].apply(f)
test_df['x'] = test_df['text'].apply(f)

# Stack the per-row vectors/labels into arrays.
x_train = np.array([vec for vec in train_df['x']])
x_test = np.array([vec for vec in test_df['x']])
y_train = np.array([vec for vec in train_df['label']])
y_test = np.array([vec for vec in test_df['label']])
print('x_train: ', x_train.shape)

# Convert class vectors to binary class matrices.
num_classes = 2
y_train = to_categorical(y_train, num_classes)
Пример #10
0
    # NOTE(review): this span is the tail of a function whose definition is
    # outside this chunk; "test", "models" and "bert_model" come from there.
    # Concatenate job title and job description into the text to encode.
    test['text'] = test['岗位名称'] + test['岗位职责']
    print('begin encoding')
    f = lambda text: bert_model.encode([text])["encodes"][0]
    test['x'] = test['text'].apply(f)
    print('end encoding')
    pred_data = np.array([_ for _ in test['x']])
    # Model index 3 apparently returns per-class probabilities (needs
    # argmax); the others return labels directly — TODO confirm upstream.
    for ind, model in enumerate(models):
        if ind != 3:
            pred = model.predict(pred_data)
            test['pred%d' % (ind)] = pred
        else:
            pred = model.predict(pred_data)
            y = np.argmax(pred, axis=1)
            test['pred%d' % (ind)] = y
    test = test.drop('x', axis=1)
    test.to_csv('result/output.csv', encoding='gbk')


if __name__ == '__main__':
    bert_model = BertVector(pooling_strategy="REDUCE_MEAN", max_seq_len=400)
    #embedding(bert_model, 2)  # only needed when the training data changes;
    # the resulting vectors are saved under data/, so no need to re-run it.
    data = load_split(2)
    # Keep the argument of embedding()/load_split() in sync with the
    # trailing digit of the training-data file name.
    model1 = skl_precision(svm.SVC, *data, kernel='rbf')
    model2 = skl_precision(es.RandomForestClassifier,
                           *data,
                           max_features='sqrt')
    model3 = skl_precision(nb.GaussianNB, *data)
    model4 = mlp_precision(*data)
    test_model(bert_model, [model1, model2, model3, model4])
Пример #11
0
class BertClassification(object):
    """BERT sentence-vector classifier with a dense softmax head.

    NOTE(review): training/validation samples come from the module-level
    train_labcont / test_labcont lists and labels are mapped through
    wosy2_to_id — all defined elsewhere in this project; verify before
    reuse.
    """

    def __init__(self,
                 nb_classes=143,
                 dense_dim=256,
                 max_len=128,
                 batch_size=32,
                 epochs=30,
                 train_corpus_path="data/sent.train",
                 test_corpus_path="data/sent.test",
                 weights_file_path="./model/bertweights_fc.h5"):
        """Store hyper-parameters and build the mean-pooling BERT encoder."""
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        # Paths are stored but not used — data_iter/data_val below read the
        # in-memory lists instead (see the commented-out file reads there).
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        self.nb_samples = 46985  # number of training samples
        self.bert_model = BertVector(
            pooling_strategy="REDUCE_MEAN",
            max_seq_len=self.max_len,
            bert_model_path=
            r"D:\赵鲸朋\pycharmModel0905\pycharmModel0905\keras_bert_classification\uncased_L-12_H-768_A-12",
            graph_tmpfile="./data/output/tmp_graph_xxx")

    def text2bert(self, text):
        """Convert a text into its (mean-pooled) BERT vector."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    #############################################################################################

    #############################################################################################
    def data_format(self, lines):
        """Convert tab-separated label/content lines into (X, one-hot y)."""
        X, y = [], []
        for line in lines:
            line = line.strip().split("\t")
            # label = int(line[0])
            # Labels are category names, mapped to integer ids.
            label = wosy2_to_id[line[0]]
            content = line[1]
            vec = self.text2bert(content)
            X.append(vec)
            y.append(label)
        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Infinite batch generator over the in-memory training samples."""
        # fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        # lines = fr.readlines()
        # fr.close()
        # random.shuffle(lines)
        lines = train_labcont
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Format the in-memory validation samples."""
        # fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        # lines = fr.readlines()
        # fr.close()
        # random.shuffle(lines)
        lines = test_labcont
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        """768-d input -> dense(relu) -> batchnorm -> softmax classifier."""
        x_in = Input(shape=(768, ))
        # tanh
        x_out = Dense(self.dense_dim, activation="relu")(x_in)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        """Compile and fit with val_acc checkpointing and early stopping."""
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])
        model.summary()
        checkpoint = ModelCheckpoint(self.weights_file_path,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        early_stopping = EarlyStopping(monitor='val_acc',
                                       patience=2,
                                       mode='max')
        x_test, y_test = self.data_val()
        model.fit_generator(
            self.data_iter(),
            # +1 so the final partial batch is not dropped.
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint, early_stopping])
Пример #12
0
#coding:utf-8

from bert.extract_feature import BertVector

# Compare BERT encodings of the same sentence in two forms: s1 is
# pre-tokenised (space-separated words), s2 is the raw character string.
pooling_strategy = "REDUCE_MEAN"
#pooling_strategy = "NONE"
bc = BertVector(pooling_strategy=pooling_strategy, max_seq_len=80)
s1 = '人 同 去 福田 图书馆 啊 在 家 写 作业 巨 没 feel , 我 的 作业'
s2 = "人同去福田图书馆啊在家写作业巨没feel,我的作业"
# Encode and print each variant's sentence vector.
v = bc.encode([s1])
v1 = v["encodes"][0]
print(v1)
v = bc.encode([s2])
v2 = v["encodes"][0]
print(v2)
Пример #13
0
class BertClassification(object):
    """BERT sentence-vector classifier with a dense softmax head.

    Sentences are encoded once with BertVector (mean-pooled, 768-d) and
    classified by dense -> batchnorm -> softmax; after training, macro
    precision/recall/F1 are printed for the validation split.
    """

    def __init__(self,
                 nb_classes=3,
                 dense_dim=256,
                 max_len=100,
                 batch_size=128,
                 epochs=50,
                 train_corpus_path="data/train.csv",
                 test_corpus_path="data/dev.csv",
                 weights_file_path="./model/weights_fc.h5"):
        """Store hyper-parameters and build the BERT encoder."""
        self.nb_classes = nb_classes
        self.dense_dim = dense_dim
        self.max_len = max_len
        self.batch_size = batch_size
        self.epochs = epochs
        self.weights_file_path = weights_file_path
        self.train_corpus_path = train_corpus_path
        self.test_corpus_path = test_corpus_path

        self.nb_samples = 17  # number of training samples
        self.bert_model = BertVector(pooling_strategy="REDUCE_MEAN",
                                     max_seq_len=self.max_len,
                                     bert_model_path="chinese_L-12_H-768_A-12",
                                     graph_tmpfile="./tmp_graph_xxx")

    def text2bert(self, text):
        """Encode *text* into a single 768-d BERT vector."""
        vec = self.bert_model.encode([text])
        return vec["encodes"][0]

    def data_format(self, lines):
        """Convert raw CSV lines into (X, one-hot y) training arrays.

        Expects comma-separated rows with the text in column 2 and an
        integer label in column 4; unparsable rows are skipped with a
        diagnostic print of their first field.
        """
        X, y = [], []
        for line in lines:
            line = line.strip().split(",")
            try:
                label = int(line[4])
                content = line[2]
                vec = self.text2bert(content)
                X.append(vec)
                y.append(label)
            except Exception:
                # BUGFIX: was a bare "except:", which also swallowed
                # KeyboardInterrupt/SystemExit; report the row id and skip.
                print(line[0])

        X = np.array(X)
        y = np_utils.to_categorical(np.asarray(y), num_classes=self.nb_classes)
        return X, y

    def data_iter(self):
        """Infinite batch generator over the shuffled training CSV."""
        fr = codecs.open(self.train_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        while True:
            for index in range(0, len(lines), self.batch_size):
                batch_samples = lines[index:index + self.batch_size]
                X, y = self.data_format(batch_samples)
                yield (X, y)

    def data_val(self):
        """Load and format the whole validation corpus."""
        fr = codecs.open(self.test_corpus_path, "r", "utf-8")
        lines = fr.readlines()
        fr.close()
        random.shuffle(lines)
        X, y = self.data_format(lines)
        return X, y

    def create_model(self):
        """768-d input -> dense(relu) -> batchnorm -> softmax classifier."""
        x_in = Input(shape=(768, ))
        x_out = Dense(self.dense_dim, activation="relu")(x_in)
        x_out = BatchNormalization()(x_out)
        x_out = Dense(self.nb_classes, activation="softmax")(x_out)
        model = Model(inputs=x_in, outputs=x_out)
        return model

    def train(self):
        """Train with val_acc checkpointing, then print macro P/R/F1."""
        model = self.create_model()
        model.compile(loss='categorical_crossentropy',
                      optimizer=Adam(),
                      metrics=['accuracy'])

        checkpoint = ModelCheckpoint(self.weights_file_path,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        x_test, y_test = self.data_val()

        model.fit_generator(
            self.data_iter(),
            # +1 so the final partial batch is not dropped.
            steps_per_epoch=int(self.nb_samples / self.batch_size) + 1,
            epochs=self.epochs,
            verbose=1,
            validation_data=(x_test, y_test),
            validation_steps=None,
            callbacks=[checkpoint])

        # Evaluate on the validation split.
        pred = model.predict(x_test)
        pred = [np.argmax(val) for val in pred]
        print(pred)
        y_true = [np.argmax(val) for val in y_test]
        print(y_true)

        p = precision_score(y_true, pred, average='macro')
        r = recall_score(y_true, pred, average='macro')
        f1 = f1_score(y_true, pred, average='macro')
        print(p)
        print(r)
        print(f1)
Пример #14
0
from bert.extract_feature import BertVector

## Shuffle the original dataset and split it into train/validation sets.
## NOTE(review): the original comment claimed 200 train / 10000 validation,
## but the code below draws 10200 samples and splits them 8000 / 2200;
## randint samples WITH replacement, so indices may repeat.

idxs = np.random.randint(0, len(texts), size=10200)

# texts / labels are defined earlier in this project — verify before reuse.
X = []
y = []
for id in idxs:
    X.append(texts[id])
    y.append(labels[id])

X_VEC = []
## Vectorise with BERT (NONE pooling: token-level vectors per sentence).
print("star encoding...")
bert_model = BertVector(pooling_strategy="NONE", max_seq_len=100)
for text in X:
    X_VEC.append(bert_model.encode([text])["encodes"][0])

# Keep only position 0 of each encoding — presumably the [CLS] vector;
# confirm against bert.extract_feature.
X_VEC_CLS = []
for vec in X_VEC:
    X_VEC_CLS.append(vec[0])

x_train = np.array(X_VEC_CLS[:8000])
x_test = np.array(X_VEC_CLS[8000:])
y_train = np.array(y[:8000])
y_test = np.array(y[8000:])

# Train/test arrays are ready; build the model next.

print("star training...")
Пример #15
0
        "變種病毒": 100,
        "輝瑞疫苗": 100,
        "蠟筆小新": 100,
        "新型流感": 1,
        "冠狀病毒": 1,
        "武肺": 1,
        "指揮中心": 1,
        "口罩": 1,
        "酷碰券": 100,
        "蔡英文": 100,
        "蔡政府": 100,
    }
    dictionary = construct_dictionary(word_to_weight)

    # load model
    bv = BertVector()
    ws = WS('data')
    pos = POS('data')
    ner = NER('data')

    #sentiment_score
    score_s = sentiment("train_done.csv")

    # max_len
    max_head_len = 150

    # load data
    df_train = cat_replace(pd.read_csv('train_done.csv')).drop_duplicates(
        subset=['claim']).dropna()
    df_test = cat_replace(pd.read_csv('test_done.csv')).drop_duplicates(
        subset=['claim']).dropna()