示例#1
0
 def create_model(self, ):
     """Train a BLSTM classifier on the shared BERT embedding and return it.

     The embedding processor's ``add_bos_eos`` flag is switched off first.
     The model is then fitted for one epoch on ``valid_x`` / ``valid_y`` and
     its predictions for the first 20 validation samples are printed.
     """
     embedding = self.bert_embedding
     embedding.processor.add_bos_eos = False

     classifier = BLSTMModel(embedding=embedding)
     classifier.fit(valid_x, valid_y, epochs=1)
     print(classifier.predict(valid_x[:20]))
     return classifier
示例#2
0
    def test_classification_eval_callback(self):
        """Smoke-test ``EvalCallBack`` during a one-epoch BLSTM training run.

        Only the first 1000 training samples are used; the callback is given
        the full test split and ``step=1``.
        """
        sample_limit = 1000
        train_x, train_y = SMP2018ECDTCorpus.load_data()
        test_x, test_y = SMP2018ECDTCorpus.load_data('test')
        train_x, train_y = train_x[:sample_limit], train_y[:sample_limit]

        classifier = BLSTMModel()
        evaluator = callbacks.EvalCallBack(classifier, test_x, test_y, step=1)
        classifier.fit(train_x, train_y, callbacks=[evaluator], epochs=1)
 def test_word2vec_embedding(self):
     """Fit a BLSTM model on a word2vec embedding and check predict() types.

     A single sentence must yield a ``str`` and a one-element batch a ``list``.
     """
     w2v = WordEmbeddings('sgns.weibo.bigram', sequence_length=30, limit=5000)
     self.prepare_model(w2v)
     # NOTE(review): self.model is assigned again right here, so whatever
     # prepare_model() built appears to be discarded — confirm both are needed.
     self.model = BLSTMModel(embedding=w2v)
     self.model.fit(self.x_data,
                    self.y_data,
                    x_validate=self.x_eval,
                    y_validate=self.y_eval)

     chars = list('语言学包含了几种分支领域。')
     logging.info(self.model.embedding.tokenize(chars))
     logging.info(self.model.predict(chars))

     single_is_str = isinstance(self.model.predict(chars), str)
     self.assertTrue(single_is_str)
     batch_is_list = isinstance(self.model.predict([chars]), list)
     self.assertTrue(batch_is_list)
 def interview(self):
     """Score the saved model on the test file, five labels per question.

     Each sample is predicted on its own and the predicted labels are joined
     into one string; the expected labels are joined the same way. When both
     strings have the same length they are compared in consecutive
     five-character slices, and a slice counts as a correct answer only if
     it matches exactly. Nothing is reported when the lengths differ.
     """
     model = BLSTMModel.load_model("../健康管理师分字BERT-model")
     x_items, train_y = self.read_message('../data/健康管理师分类数据集/test.txt')
     model.evaluate(x_items, train_y)
     # str.join avoids rebuilding the whole string on every append.
     results_string: str = ''.join(model.predict(item) for item in x_items)
     train_string: str = ''.join(train_y)
     if len(results_string) != len(train_string):
         return

     print('预测结果', results_string, '正确结果', train_string)
     print('五个五个去判断 全等就是做对了不全等就是做错了')
     group_size = 5
     # Count whole groups only: a fractional count would skew both the error
     # count and the accuracy when the length is not a multiple of five.
     total: int = len(train_string) // group_size
     right: int = 0
     for index in range(total):
         start = index * group_size
         end = start + group_size
         var = results_string[start:end]
         result = train_string[start:end]
         if var == result:
             print('做对了')
             right += 1
         else:
             print('做错了', var, result)
     wrong = total - right
     print('正确答案', right, '错误答案', wrong)
     # An empty test set has no groups; report 0.0 instead of dividing by zero.
     print('准确率', right / total if total else 0.0)
示例#5
0
 def interview(self):
     """Score the saved model on the test file, five labels per question.

     Each sample is predicted on its own and the predicted labels are joined
     into one string; the expected labels are joined the same way. When both
     strings have the same length they are compared in consecutive
     five-character slices, and a slice counts as a correct answer only if
     it matches exactly. The input samples of each group are printed next to
     the verdict. Nothing is reported when the lengths differ.
     """
     model = BLSTMModel.load_model("../西药执业药师-model")
     x_items, train_y = self.read_message('../data/西药执业药师/test.txt')
     model.evaluate(x_items, train_y)
     # str.join avoids rebuilding the whole string on every append.
     results_string: str = ''.join(model.predict(item) for item in x_items)
     train_string: str = ''.join(train_y)
     if len(results_string) != len(train_string):
         return

     print('五个五个去判断 全等就是做对了不全等就是做错了')
     group_size = 5
     total: int = len(train_string) // group_size
     print('验证数据集长度', total)
     right: int = 0
     for index in range(total):
         start = index * group_size
         end = start + group_size
         # Slice by the loop index. Slicing by the group count selected the
         # same span beyond the last complete group on every iteration.
         # NOTE(review): assumes x_items lines up one-to-one with the label
         # characters — confirm against read_message.
         single = x_items[start:end]
         var = results_string[start:end]
         result = train_string[start:end]
         if var == result:
             print('做对了', single)
             right += 1
         else:
             print('做错了', var, result, single)
     wrong = total - right
     print('正确答案', right, '错误答案', wrong)
     # An empty test set has no groups; report 0.0 instead of dividing by zero.
     print('准确率', right / total if total else 0.0)
    def train(self):
        """Train a BLSTM classifier on the car data, predict the test file, save.

        Training and validation data come from ``car/train.csv`` through
        ``self.read_message``. After fitting, every row of ``car/test.csv`` is
        classified and the ``id,flag`` pairs are written to
        ``data/test_predict_bert_car.csv``. The model is saved last.
        """
        x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')
        # NOTE(review): `bert` and `tf_board_callback` are not created in this
        # method; presumably they are defined elsewhere in the module — confirm.
        model = BLSTMModel(bert)
        # Fit on the training split and validate on the held-out split.
        model.fit(x_items,
                  train_y,
                  valid_x,
                  valid_y,
                  batch_size=64,
                  epochs=12,
                  callbacks=[tf_board_callback])

        rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
        # Column 0 is written out as the id; columns 1 and 2 are concatenated
        # and split into single characters to form the model input.
        test_data = [list(str(row[1]) + str(row[2])) for row in rows]
        id_list = [row[0] for row in rows]
        predict_answers = model.predict(x_data=test_data)

        # The context manager makes sure the predictions are flushed and the
        # handle is released even if writing fails part-way.
        with open("data/test_predict_bert_car.csv", 'w',
                  encoding='utf-8') as out_file:
            out_file.write("id,flag\n")
            for sample_id, answer in zip(id_list, predict_answers):
                # Convert first: ids parsed as numbers have no .strip().
                out_file.write(str(sample_id).strip() + "," + str(answer) + "\n")

        # Save the trained model.
        model.save("../model/news-classification-bert-model")
 def test_save_and_load(self):
     """A fitted model must survive save() / load_model() and still predict.

     The reloaded model is expected to return a ``str`` for one sentence.
     """
     import shutil

     self.test_fit()
     # Use a private directory rather than the shared temp root, so the saved
     # files cannot collide with other tests or with leftovers from old runs.
     model_path = tempfile.mkdtemp(prefix='kashgari_model_')
     self.addCleanup(shutil.rmtree, model_path, ignore_errors=True)

     self.model.save(model_path)
     new_model = BLSTMModel.load_model(model_path)
     self.assertIsNotNone(new_model)

     sentence = list('语言学包含了几种分支领域。')
     # assertIsInstance reports the actual type when the check fails.
     self.assertIsInstance(new_model.predict(sentence), str)
示例#8
0
 def test_save_and_load(self):
     """Save the fitted model to a timestamped temp folder, reload it, predict.

     The reloaded model must exist and return a ``str`` for one sentence.
     """
     self.test_fit()

     run_folder = str(time.time())
     model_path = os.path.join(tempfile.gettempdir(), 'kashgari_model',
                               run_folder)
     self.model.save(model_path)

     restored = BLSTMModel.load_model(model_path)
     assert restored is not None

     chars = list('语言学包含了几种分支领域。')
     prediction = restored.predict(chars)
     assert isinstance(prediction, str)
    def interview(self):
        """Score the saved model per question and export right/wrong questions.

        Each sample is predicted on its own and the predicted labels are
        joined into one string; the expected labels are joined the same way.
        When both strings have the same length they are compared in
        consecutive five-character slices, and a slice counts as a correct
        answer only if it matches exactly. The matching slice of ``x_full``
        is printed and collected. The collected entries are then written to
        ``right single.csv`` / ``wrong single.csv``, one row per entry, with
        the entry split on tabs into columns. Nothing is reported or written
        when the lengths differ.
        """
        model = BLSTMModel.load_model("../健康管理师单选分字BERT-model")
        x_items, train_y = self.read_message('../data/yingyangshi/test.txt')
        x_full = self.full_message('../data/yingyangshi/test.txt')
        model.evaluate(x_items, train_y)
        # str.join avoids rebuilding the whole string on every append.
        results_string: str = ''.join(model.predict(item) for item in x_items)
        train_string: str = ''.join(train_y)
        if len(results_string) != len(train_string):
            return

        def write_groups(path: str, groups: list) -> None:
            # One CSV row per entry; columns are its tab-separated fields.
            with open(path, 'w', newline='', encoding='utf-8') as csv_file:
                csv_writer = csv.writer(csv_file)
                for group in groups:
                    for message in group:
                        csv_writer.writerow(message.split('\t'))

        print('预测结果', results_string, '正确结果', train_string)
        print('五个五个去判断 全等就是做对了不全等就是做错了')
        group_size = 5
        total: int = len(train_string) // group_size
        print('验证数据集长度', total)

        right_predict: list = []
        wrong_predict: list = []
        for index in range(total):
            start_single: int = index * group_size
            end_single: int = start_single + group_size
            single = x_full[start_single:end_single]
            var = results_string[start_single:end_single]
            result = train_string[start_single:end_single]
            if var == result:
                print('做对了')
                right_predict.append(single)
            else:
                print('做错了', var, result)
                wrong_predict.append(single)
            for varey in single:
                print(varey)

        right: int = len(right_predict)
        wrong = total - right
        write_groups('wrong single.csv', wrong_predict)
        write_groups('right single.csv', right_predict)
        print('正确答案', right, '错误答案', wrong)
        # An empty test set has no groups; report 0.0 instead of dividing by zero.
        print('准确率', right / total if total else 0.0)
示例#10
0
def test_dataset(model_dir: str) -> list:
    """Classify unlabelled pages from the database and refresh the hot words.

    Rows of ``webpage_text`` whose ``<model>_predict`` column is NULL are
    read. Each page's text and title are joined, tokenised with jieba,
    filtered through ``strip_stopwords`` and classified, and the prediction
    is written back to the row. Afterwards ``hot_key`` is emptied and
    refilled from the 100 most frequent tokens, skipping single-character
    ones.

    Args:
        model_dir: Path of the saved model. The part of its last ``/``
            component before ``.model`` names the prediction column, and the
            path must contain ``cnn.model``, ``cnnlstm.model`` or
            ``blstm.model`` to select the model class.

    Raises:
        ValueError: If ``model_dir`` matches none of the supported models.

    NOTE(review): the signature is annotated ``-> list`` but the function
    returns nothing (None) — confirm the intended return value.
    """
    model_name = model_dir.split('.model')[0].split('/')[-1]
    # Identifiers cannot be bound as query parameters, so the column name is
    # backtick-quoted with any embedded backtick doubled.
    predict_column = '`{}_predict`'.format(model_name.replace('`', '``'))

    conn = pymysql.connect(host=DB_HOST,
                           port=int(DB_PORT),
                           user=DB_USER,
                           password=DB_PASS,
                           db=DB_NAME,
                           charset=DB_CHARSET
                           )
    try:
        cursor = conn.cursor()
        cursor.execute(
            'SELECT `page_text`,`page_title`,`category`,`hash` '
            'FROM `webpage_text` '
            'WHERE ' + predict_column + ' IS NULL ORDER BY `time` desc'
        )
        data = cursor.fetchall()

        # Pick the model class from the path. Fail loudly on an unknown type
        # instead of hitting an unbound `model` inside the loop.
        if 'cnn.model' in model_dir:
            model = CNNModel.load_model(model_dir)
        elif 'cnnlstm.model' in model_dir:
            model = CNNLSTMModel.load_model(model_dir)
        elif 'blstm.model' in model_dir:
            model = BLSTMModel.load_model(model_dir)
        else:
            raise ValueError(
                'unsupported model type in path: {!r}'.format(model_dir))

        # Values are bound as parameters; '%' in the column name is escaped
        # because the driver %-formats the statement when arguments are given.
        update_sql = ('UPDATE `webpage_text` SET '
                      + predict_column.replace('%', '%%')
                      + '=%s WHERE hash=%s')

        all_text = []
        for row in tqdm.tqdm(data):
            # Tokenise the page text followed by the title.
            content = strip_stopwords(list(jieba.cut(row[0] + '。' + row[1])))
            all_text += content
            predict = model.predict(content)
            cursor.execute(update_sql, (predict, row[3]))
            # Commit per row so finished predictions survive a later failure.
            conn.commit()

        # Rebuild the hot-word table from the token frequencies. Delete and
        # inserts share one commit, so the table is never seen half-filled.
        word_counts = Counter(all_text)
        cursor.execute('DELETE FROM `hot_key` WHERE 1=1')
        rank = 1
        for word, frequency in word_counts.most_common(100):
            if len(word) == 1:
                continue
            cursor.execute('INSERT INTO `hot_key` VALUES (%s, %s, %s)',
                           (rank, word, frequency))
            rank += 1
        conn.commit()
        print('[+] Success')
    finally:
        conn.close()
示例#11
0
    def test_multi_label_model(self):
        """Train a multi-label model, then check predictions before and after reload.

        ``predict`` on a single sample must return a ``tuple`` both for the
        freshly trained model and for the copy loaded back from disk.
        """
        classifier = self.model_class(multi_label=True)
        classifier.fit(train_x, train_multi_y, eval_x, eval_multi_y, epochs=2)
        first_prediction = classifier.predict(train_x[0])
        assert isinstance(first_prediction, tuple)

        # Timestamped folder under the system temp directory.
        run_folder = str(time.time())
        save_dir = os.path.join(tempfile.gettempdir(), 'kashgari_model',
                                run_folder)
        classifier.save(save_dir)

        restored = BLSTMModel.load_model(save_dir)
        assert restored is not None
        chars = list('语言学包含了几种分支领域。')
        assert isinstance(restored.predict(chars), tuple)
示例#12
0
    def test_bert_model(self):
        """A BERT-backed BLSTM model must predict the same after save and reload.

        The model is trained for one epoch on the validation data. Predictions
        for the first 20 samples are compared before saving and after loading
        through ``kashgari.utils.load_model``.
        """
        bert = BERTEmbedding(bert_path,
                             task=kashgari.CLASSIFICATION,
                             sequence_length=100)
        classifier = BLSTMModel(embedding=bert)
        classifier.fit(valid_x, valid_y, epochs=1)
        samples = valid_x[:20]
        expected = classifier.predict(samples)

        save_dir = os.path.join(tempfile.gettempdir(), str(time.time()))
        classifier.save(save_dir)

        restored = kashgari.utils.load_model(save_dir)
        actual = restored.predict(samples)
        assert np.array_equal(actual, expected)
示例#13
0
 def interview(self):
     """Score the saved model on the multiple-choice file, five labels per question.

     Each sample is predicted on its own and the predicted labels are joined
     into one string; the expected labels are joined the same way. When both
     strings have the same length they are compared in consecutive
     five-character slices, and a slice counts as a correct answer only if
     it matches exactly. ``wrong single.csv`` and ``right single.csv`` are
     (re)written from the collected lists. Nothing is reported or written
     when the lengths differ.
     """
     model = BLSTMModel.load_model(
         "../model/health_manager_multi_bert-model")
     x_items, train_y = read_message(
         '../data/health_manager_v2/multiple-choice.csv')
     model.evaluate(x_items, train_y)
     # str.join avoids rebuilding the whole string on every append.
     results_string: str = ''.join(model.predict(item) for item in x_items)
     train_string: str = ''.join(train_y)
     right_predict: list = []
     wrong_predict: list = []
     if len(results_string) != len(train_string):
         return

     print('预测结果', results_string, '正确结果', train_string)
     print('五个五个去判断 全等就是做对了不全等就是做错了')
     group_size = 5
     total: int = len(train_string) // group_size
     print('验证数据集长度', total)
     right: int = 0
     for index in range(total):
         start_single: int = index * group_size
         end_single: int = start_single + group_size
         var = results_string[start_single:end_single]
         result = train_string[start_single:end_single]
         if var == result:
             print('做对了')
             # Count the hit; without this every run reported zero correct
             # answers and an accuracy of 0.
             right += 1
         else:
             print('做错了', var, result)
     wrong = total - right

     # NOTE(review): right_predict / wrong_predict are never filled in this
     # method, so both CSV files are written empty — confirm what should be
     # collected here.
     with open('wrong single.csv', 'w', newline='',
               encoding='utf-8') as csv_file:
         csv_writer = csv.writer(csv_file)
         for wrong_group in wrong_predict:
             for message in wrong_group:
                 csv_writer.writerow(message.split('\t'))
     with open('right single.csv', 'w', newline='',
               encoding='utf-8') as csv_file:
         csv_writer = csv.writer(csv_file)
         for right_group in right_predict:
             for message in right_group:
                 csv_writer.writerow(message.split('\t'))

     print('正确答案', right, '错误答案', wrong)
     # An empty test set has no groups; report 0.0 instead of dividing by zero.
     print('准确率', right / total if total else 0.0)
 def train(self):
     """Train a BERT-embedded BLSTM classifier on the yingyangshi data and save it.

     Training and dev samples are read with ``self.read_message``. The model
     is fitted for 2 epochs with batch size 64.
     """
     train_samples, train_labels = self.read_message(
         '../data/yingyangshi/train.txt')
     dev_samples, dev_labels = self.read_message(
         '../data/yingyangshi/dev.txt')

     # BERT embedding loaded from the given checkpoint directory.
     embedding = BERTEmbedding('textclassfation/input0/chinese_L-12_H-768_A-12')
     classifier = BLSTMModel(embedding)
     classifier.fit(train_samples,
                    train_labels,
                    dev_samples,
                    dev_labels,
                    batch_size=64,
                    epochs=2)

     # Save the trained model.
     classifier.save("../健康管理师单选分字BERT-model")
    def train(self):
        """Train a default-embedding BLSTM classifier and save it.

        Training and dev samples are read with ``self.read_message``. The
        model is fitted for 20 epochs with batch size 32, and
        ``tf_board_callback`` is handed to ``fit`` through ``fit_kwargs``.
        """
        data_dir = '../data/Chinese medicine licensed pharmacist/'
        train_samples, train_labels = self.read_message(data_dir + 'train.txt')
        dev_samples, dev_labels = self.read_message(data_dir + 'dev.txt')

        classifier = BLSTMModel()
        extra_fit_options = {'callbacks': [tf_board_callback]}
        # Training data, labels, validation data and the training schedule.
        classifier.fit(train_samples,
                       train_labels,
                       dev_samples,
                       dev_labels,
                       epochs=20,
                       batch_size=32,
                       fit_kwargs=extra_fit_options)

        # Save the trained model.
        classifier.save("../model/中医执业药师char-model")
示例#16
0
    def train(self):
        """Train a BERT-embedded BLSTM classifier on the 西药执业药师 data and save it.

        Training and dev samples are read with ``self.read_message``. The
        model is fitted for 8 epochs with batch size 256, and
        ``tf_board_callback`` is handed to ``fit`` through ``fit_kwargs``.
        """
        train_samples, train_labels = self.read_message(
            '../data/西药执业药师/train.txt')
        dev_samples, dev_labels = self.read_message(
            '../data/西药执业药师/dev.txt')

        # BERT embedding with sequences limited to 200 tokens.
        embedding = BERTEmbedding('bert-base-chinese', sequence_length=200)
        classifier = BLSTMModel(embedding)

        extra_fit_options = {'callbacks': [tf_board_callback]}
        classifier.fit(train_samples,
                       train_labels,
                       dev_samples,
                       dev_labels,
                       batch_size=256,
                       epochs=8,
                       fit_kwargs=extra_fit_options)

        # Save the trained model.
        classifier.save("../西药执业药师-model")
    def train(self):
        """Train one BERT-embedded BLSTM classifier on three merged training sets.

        The 健康管理师, 西药执业药师 and yingyangshi training files are read
        with ``self.read_message`` and appended, in that order, to the first
        set's lists. Validation uses the 健康管理师 valid file. The model is
        fitted for 8 epochs with batch size 128 and then saved.
        """
        x_items, train_y = self.read_message('../data/健康管理师分类数据集/train.txt')
        extra_sets = (
            self.read_message('../data/西药执业药师/train.txt'),
            self.read_message('../data/yingyangshi/train.txt'),
        )
        # Grow the first data set in place with each additional one.
        for extra_samples, extra_labels in extra_sets:
            x_items.extend(extra_samples)
            train_y.extend(extra_labels)

        dev_samples, dev_labels = self.read_message(
            '../data/健康管理师分类数据集/valid.txt')

        # BERT embedding with sequences limited to 200 tokens.
        embedding = BERTEmbedding('bert-base-chinese', sequence_length=200)
        classifier = BLSTMModel(embedding)

        extra_fit_options = {'callbacks': [tf_board_callback]}
        classifier.fit(x_items,
                       train_y,
                       dev_samples,
                       dev_labels,
                       batch_size=128,
                       epochs=8,
                       fit_kwargs=extra_fit_options)

        # Save the trained model.
        classifier.save("../健康管理师分字BERT-model")
 def pre_train(self):
     """Load the saved classification model and evaluate it on the test file."""
     test_file = '../data/Chinese medicine licensed pharmacist/test.txt'
     classifier = BLSTMModel.load_model("../model/中医执业药师classification-model")
     samples, labels = self.read_message(test_file)
     classifier.evaluate(samples, labels)
示例#19
0
 def pre_evaluate(self):
     """Load the saved model and run one prediction on an empty string.

     The prediction is not used; the method returns ``None``.
     """
     saved_model_dir = "../model/health_manager_multi_bert-model"
     classifier = BLSTMModel.load_model(saved_model_dir)
     classifier.predict("")
示例#20
0
 def setUpClass(cls):
     """Set the shared epoch count and a default ``BLSTMModel`` on the class."""
     epoch_count = 3
     cls.epochs = epoch_count
     cls.model = BLSTMModel()
示例#21
0
 def setUpClass(cls):
     """Set the shared epoch count and a word2vec-backed ``BLSTMModel`` on the class."""
     epoch_count = 3
     cls.epochs = epoch_count
     w2v_embedding = EmbeddingManager.get_w2v()
     cls.model = BLSTMModel(w2v_embedding)
class BLSTMModelModelTest(unittest.TestCase):
    """Tests for ``BLSTMModel`` on a tiny character-level Chinese data set.

    ``__model_class__`` holds the class that ``prepare_model`` instantiates.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Model class instantiated by prepare_model().
        self.__model_class__ = BLSTMModel
        # Training samples are lists of single characters; labels are 'a'/'b'/'c'.
        self.x_data = [
            list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
            list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
            list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
            list('语言学包含了几种分支领域。'),
            list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
        ]
        self.y_data = ['a', 'a', 'a', 'b', 'c']

        # Validation samples and labels, in the same format.
        self.x_eval = [
            list('语言学是一门关于人类语言的科学研究。'),
            list('语言学包含了几种分支领域。'),
            list('在语言结构研究与意义研究之间存在一个重要的主题划分。'),
            list('语法中包含了词法,句法以及语音。'),
            list('语音学是语言学的一个相关分支,它涉及到语音与非语音声音的实际属性,以及它们是如何发出与被接收到的。'),
            list('与学习语言不同,语言学是研究所有人类语文发展有关的一门学术科目。'),
        ]

        self.y_eval = ['a', 'a', 'a', 'b', 'c', 'a']

    def prepare_model(self, embedding: BaseEmbedding = None):
        """Create ``self.model`` from ``__model_class__`` with the given embedding."""
        self.model = self.__model_class__(embedding)

    def test_build(self):
        """Fitting must populate the label and token vocabularies."""
        self.prepare_model()
        self.model.fit(self.x_data, self.y_data)
        # NOTE(review): three distinct labels are trained but four entries are
        # expected — presumably one reserved label; confirm.
        self.assertEqual(len(self.model.label2idx), 4)
        self.assertGreater(len(self.model.token2idx), 4)
        logging.info(self.model.embedding.token2idx)

    def test_fit(self):
        """Fit with validation data. Other tests call this to get a fitted model."""
        self.prepare_model()
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)

    def test_label_token_convert(self):
        """Label/index conversion must work for scalars and for lists."""
        self.test_fit()
        self.assertIsInstance(self.model.convert_label_to_idx('a'), int)
        self.assertIsInstance(self.model.convert_idx_to_label(1), str)

        self.assertTrue(
            all(
                isinstance(i, int)
                for i in self.model.convert_label_to_idx(['a'])))
        self.assertTrue(
            all(
                isinstance(i, str)
                for i in self.model.convert_idx_to_label([1, 2])))
        sentence = list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分')
        tokens = self.model.embedding.tokenize(sentence)
        # Tokenizing is expected to yield two tokens more than the input has.
        self.assertEqual(len(sentence) + 2, len(tokens))

    def test_predict(self):
        """predict() returns ``str`` for one sentence and ``list`` for a batch."""
        self.test_fit()
        sentence = list('语言学包含了几种分支领域。')
        single = self.model.predict(sentence)
        self.assertIsInstance(single, str)
        self.assertIsInstance(self.model.predict([sentence]), list)
        logging.info('test predict: {} -> {}'.format(sentence, single))

    def test_eval(self):
        """evaluate() must run on a fitted model."""
        self.test_fit()
        self.model.evaluate(self.x_data, self.y_data)

    def test_bert(self):
        """Fit and predict with a BERT embedding."""
        embedding = BERTEmbedding('chinese_L-12_H-768_A-12',
                                  sequence_length=30)
        self.prepare_model(embedding)
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)
        sentence = list('语言学包含了几种分支领域。')
        logging.info(self.model.embedding.tokenize(sentence))
        single = self.model.predict(sentence)
        logging.info(single)
        self.assertIsInstance(single, str)
        self.assertIsInstance(self.model.predict([sentence]), list)

    def test_word2vec_embedding(self):
        """Fit and predict with a word2vec embedding."""
        embedding = WordEmbeddings('sgns.weibo.bigram',
                                   sequence_length=30,
                                   limit=5000)
        # Built directly; a prepare_model() call here would only create a
        # model that this assignment replaces at once.
        self.model = BLSTMModel(embedding=embedding)
        self.model.fit(self.x_data,
                       self.y_data,
                       x_validate=self.x_eval,
                       y_validate=self.y_eval)
        sentence = list('语言学包含了几种分支领域。')
        logging.info(self.model.embedding.tokenize(sentence))
        single = self.model.predict(sentence)
        logging.info(single)
        self.assertIsInstance(single, str)
        self.assertIsInstance(self.model.predict([sentence]), list)

    def test_save_and_load(self):
        """A fitted model must survive save() / load_model() and still predict."""
        import shutil

        self.test_fit()
        # Use a private directory rather than the shared temp root, so the
        # saved files cannot collide with other tests or old leftovers.
        model_path = tempfile.mkdtemp(prefix='kashgari_model_')
        self.addCleanup(shutil.rmtree, model_path, ignore_errors=True)

        self.model.save(model_path)
        new_model = BLSTMModel.load_model(model_path)
        self.assertIsNotNone(new_model)
        sentence = list('语言学包含了几种分支领域。')
        self.assertIsInstance(new_model.predict(sentence), str)
示例#23
0
# Training script: BERT-embedded BLSTM classifier with early stopping and
# TensorBoard logging.
log_filepath = r"D:\data\biendata\ccks2019_el\kashgari\m0log"

# Early stopping watches val_loss (lower is better) with patience 2.
# Previously tried: monitor="val_acc", mode="max".
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)

# TensorBoard callback: graph written, images and histograms disabled.
log = TensorBoard(log_dir=log_filepath,
                  write_images=False,
                  write_graph=True,
                  histogram_freq=0)

# BERT checkpoint directory.
# Previously tried: r'D:\data\bert\chinese_L-12_H-768_A-12'.
emn_path = r'D:\data\bert\chinese-bert_chinese_wwm_L-12_H-768_A-12'
embedding = BERTEmbedding(emn_path, sequence_length=512)

# Previously tried: DropoutBGRUModel(embedding).
model = BLSTMModel(embedding)
# NOTE(review): x_train / y_train / x_validate / y_validate differ from the
# train_x / validate_x names used by fit() and evaluate() below — confirm
# that both sets of names are defined.
model.build_model(x_train, y_train, x_validate, y_validate)

# Training goes through model.fit() only. Calling the underlying Keras
# fit_generator() without a generator argument raises before training starts.
model.fit(train_x,
          train_y,
          x_validate=validate_x,
          y_validate=validate_y,
          epochs=20,
          batch_size=128,
          labels_weight=True,
          fit_kwargs={'callbacks': [early_stop, log]})

model.evaluate(test_x, test_y)

model.save(model_path)
 def pre_evaluate(self):
     """Load the saved model and evaluate it on the health_manager_v4 test file."""
     test_file = '../data/health_manager_v4/test.txt'
     classifier = BLSTMModel.load_model("../健康管理师分字-model")
     samples, labels = self.read_message(test_file)
     classifier.evaluate(samples, labels)
示例#25
0
def predict_each_line(args, model):
    """Write every misclassified test sample to ``args.output_file``.

    Each sample from ``fetch_data_set(args.test_set_path)`` is joined into a
    string, passed through ``text_processor`` and predicted on its own. When
    the prediction differs from the joined expected label, the line
    ``text<TAB>expected<TAB>predicted`` is printed and written to the file.

    Args:
        args: Parsed options; ``output_file`` and ``test_set_path`` are read.
        model: Object whose ``predict`` accepts ``batch_size`` and
            ``debug_info`` keyword arguments.
    """
    test_x, test_y = fetch_data_set(args.test_set_path)
    # Write UTF-8 explicitly so the text does not depend on the locale
    # encoding; the context manager closes the file on errors too.
    with open(args.output_file, 'w', encoding='utf-8') as fout:
        for line, y in zip(test_x, test_y):
            text = ''.join(line)
            expected = ''.join(y)
            result = model.predict(text_processor(text),
                                   batch_size=1,
                                   debug_info=False)
            if result != expected:
                str_message = text + "\t" + expected + "\t" + result
                print(str_message)
                fout.write(str_message + '\n')

if __name__ == '__main__':
    # Parse the options and send log output to the configured file.
    args = params_setup()
    logging.basicConfig(filename=args.log_path, level=logging.DEBUG)

    # Restrict the process to the requested GPU(s), numbered by PCI bus id.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id

    # load_model is called on the class, so there is no need to build a BERT
    # embedding and a throwaway model instance first.
    model = BLSTMModel.load_model(args.model_path)

    if args.predict_mode == "from_input":
        predict_from_user_input(model)
    else:
        predict_from_test_set(args, model)

示例#26
0
# Training script: word2vec-embedded BLSTM classifier with early stopping and
# TensorBoard logging.
log_filepath = r"D:\data\biendata\ccks2019_el\clf_log"

# Early stopping watches val_loss (lower is better) with patience 2.
# Previously tried: monitor="val_acc", mode="max".
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=2)

# TensorBoard callback: graph written, images and histograms disabled.
log = TensorBoard(log_dir=log_filepath,
                  write_images=False,
                  write_graph=True,
                  histogram_freq=0)

# Only the word2vec embedding is built. A BERT embedding created before it
# (r'D:\data\bert\chinese_L-12_H-768_A-12') would be overwritten unused.
emn_path = r'D:/data/word2vec/zh/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5/sgns.target.word-word.dynwin5.thr10.neg5.dim300.iter5.utf8.txt'
embedding = WordEmbeddings(emn_path, sequence_length=1024)

# Previously tried: DropoutBGRUModel(embedding).
model = BLSTMModel(embedding)

# Train on the first 100000 samples and validate on the first 20000.
model.fit(train_x[:100000],
          train_y[:100000],
          x_validate=validate_x[:20000],
          y_validate=validate_y[:20000],
          epochs=20,
          batch_size=256,
          labels_weight=True,
          fit_kwargs={'callbacks': [early_stop, log]})

model.evaluate(test_x, test_y)

model.save(model_path)
 def pre_train(self):
     """Print the saved model's prediction for every sample of the dev file.

     Each prediction is printed after a leading newline.
     """
     classifier = BLSTMModel.load_model('../classification-model')
     samples, _labels = self.read_message('../data/西药执业药师/dev.txt')
     for sample in samples:
         print("\n" + classifier.predict(sample))