Example #1
def trainModel():
    # trainX/trainY and devX/devY are prepared elsewhere in the project.
    model = CNNLSTMModel()
    model.fit(trainX, trainY, batch_size=16, epochs=5)
    predictions = model.predict(devX)
    # Simple accuracy over the dev set.
    count = 0
    for (i, j) in zip(predictions, devY):
        if i == j:
            count += 1
    print(count / len(predictions))
    return predictions
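
The trainX/trainY and devX/devY used above are not defined in this snippet. As a minimal sketch of the input format kashgari classification models expect (the sample texts and the jieba-based tokenization are illustrative assumptions, not part of the original), each sample is a token list paired with a label:

# Hypothetical preparation of the data used by trainModel(); the raw
# samples and the jieba tokenization are illustrative assumptions.
import jieba

raw_train = [('这部电影非常好看', 'pos'), ('剧情拖沓不推荐', 'neg')]
trainX = [list(jieba.cut(text)) for text, _ in raw_train]
trainY = [label for _, label in raw_train]
# devX and devY would be built the same way from a held-out split.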
Example #2
import jieba
import pymysql
import tqdm
from collections import Counter
from kashgari.tasks.classification import BLSTMModel, CNNModel, CNNLSTMModel

def test_dataset(model_dir: str) -> None:
    # Fetch page text from the database, classify it with the model,
    # and write the predictions back. The DB_* constants and
    # strip_stopwords come from the surrounding project.
    conn = pymysql.connect(host=DB_HOST,
                           port=int(DB_PORT),
                           user=DB_USER,
                           password=DB_PASS,
                           db=DB_NAME,
                           charset=DB_CHARSET)
    cursor = conn.cursor()
    model_name = model_dir.split('.model')[0].split('/')[-1]
    cursor.execute("""
        SELECT `page_text`,`page_title`,`category`,`hash` FROM `webpage_text`
        WHERE `%s_predict` IS NULL ORDER BY `time` desc
        """ % model_name)
    all_text = []
    data = cursor.fetchall()
    # Choose the model class from the model file name.
    if 'cnn.model' in model_dir:
        model = CNNModel.load_model(model_dir)
    elif 'cnnlstm.model' in model_dir:
        model = CNNLSTMModel.load_model(model_dir)
    elif 'blstm.model' in model_dir:
        model = BLSTMModel.load_model(model_dir)
    else:
        raise ValueError('unrecognized model file: %s' % model_dir)
    for i in tqdm.tqdm(data):
        label = i[2]
        # Tokenize the article, joining body text and title.
        content = strip_stopwords(list(jieba.cut(i[0] + '。' + i[1])))
        all_text += content
        predict = model.predict(content)
        cursor.execute(
            'UPDATE `webpage_text` SET {model}_predict="{predict}" '.format(
                model=model_name, predict=predict) +
            'WHERE hash="%s"' % i[3])
        conn.commit()
        # print('[+] Predict:'+predict+', Label:'+label+', Title:'+i[1])

    # Count word frequencies and write the top-100 hot words to the database.
    c = Counter(all_text)
    i = 1
    cursor.execute('DELETE FROM `hot_key` WHERE 1=1')
    conn.commit()
    for k, v in c.most_common(100):
        if len(k) == 1:
            continue
        cursor.execute(
            'INSERT INTO `hot_key` VALUES ({0}, "{1}", {2})'.format(i, k, v))
        conn.commit()
        i += 1
    print('[+] Success')
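
A note on the UPDATE statement above: interpolating predict and the page hash directly into the SQL string is injection-prone. A safer sketch using pymysql's parameter binding (only values can be bound, so the column name still has to be formatted in; model_name, predict, and i refer to the loop variables above):

# Sketch: the same UPDATE with its values bound by pymysql rather than
# formatted into the string.
sql = 'UPDATE `webpage_text` SET `{}_predict`=%s WHERE `hash`=%s'.format(model_name)
cursor.execute(sql, (predict, i[3]))
conn.commit()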
Example #3
    def test_basic_use(self):
        model_folder = '/Users/brikerman/Desktop/nlp/language_models/albert_base'

        checkpoint_path = os.path.join(model_folder, 'model.ckpt-best')
        config_path = os.path.join(model_folder, 'albert_config.json')
        vocab_path = os.path.join(model_folder, 'vocab_chinese.txt')

        tokenizer = BertTokenizer.load_from_vocab_file(vocab_path)
        embed = BERTEmbeddingV2(vocab_path,
                                config_path,
                                checkpoint_path,
                                bert_type='albert',
                                task=kashgari.CLASSIFICATION,
                                sequence_length=100)

        sentences = [
            "Jim Henson was a puppeteer.",
            "This here's an example of using the BERT tokenizer.",
            "Why did the chicken cross the road?"
        ]
        labels = ["class1", "class2", "class1"]

        sentences_tokenized = [tokenizer.tokenize(s) for s in sentences]
        print(sentences_tokenized)

        train_x, train_y = sentences_tokenized[:2], labels[:2]
        validate_x, validate_y = sentences_tokenized[2:], labels[2:]

        from kashgari.tasks.classification import CNNLSTMModel
        model = CNNLSTMModel(embed)

        # ------------ build model ------------
        model.fit(train_x,
                  train_y,
                  validate_x,
                  validate_y,
                  epochs=3,
                  batch_size=32)
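
        # Continuation sketch (not in the original test): run the trained
        # model on fresh text and save it; the sentence and the output
        # directory name are illustrative assumptions.
        new_x = [tokenizer.tokenize("Puppeteers operate puppets.")]
        print(model.predict(new_x))
        model.save('albert_classification_model')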
Example #4
    @classmethod
    def setUpClass(cls):
        cls.epochs = 3
        embedding = EmbeddingManager.get_w2v()
        cls.model = CNNLSTMModel(embedding)
Example #5
    @classmethod
    def setUpClass(cls):
        cls.epochs = 3
        cls.model = CNNLSTMModel()
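
Both fragments above are setUpClass hooks from unittest test classes. A self-contained sketch of the kind of class they imply (the class name, toy corpus, and assertion are assumptions, not from the original):

import unittest
from kashgari.tasks.classification import CNNLSTMModel

class TestCNNLSTMModel(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.epochs = 3
        cls.model = CNNLSTMModel()

    def test_fit(self):
        # Tiny toy corpus: token lists with one label per sample.
        x = [['all', 'work', 'and', 'no', 'play'],
             ['makes', 'jack', 'a', 'dull', 'boy']]
        y = ['a', 'b']
        self.model.fit(x, y, epochs=self.epochs)
        self.assertEqual(len(self.model.predict(x)), len(x))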
Example #6
# Collect features and labels from the dataset, tracking the longest
# sample so it can be used as the embedding's sequence_length.
feature_list = []
label_list = []
max_len = 0
for i in five_CLS_data:
    feature_list.append(i[0])
    if len(i[0]) > max_len:
        max_len = len(i[0])
    label_list.append(i[1])
tokenizer = BertTokenizer.load_from_vocab_file(vocab_path)
embed = BERTEmbeddingV2(vocab_path,
                        config_path,
                        checkpoint_path,
                        bert_type='electra',
                        task=kashgari.CLASSIFICATION,
                        sequence_length=max_len)

# Tokenize with the BERT tokenizer
sentences_tokenized = [tokenizer.tokenize(s) for s in feature_list]
print(sentences_tokenized)

train_x, train_y = sentences_tokenized[:2], label_list[:2]
validate_x, validate_y = sentences_tokenized[2:], label_list[2:]

from kashgari.tasks.classification import CNNLSTMModel

model = CNNLSTMModel(embed)

# ------------ build model ------------
model.fit(train_x, train_y, validate_x, validate_y, epochs=3, batch_size=32)
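
As a hedged follow-up (not part of the original example), the trained model can be scored on the held-out split and persisted; kashgari classification models expose evaluate() and save(), while the output directory name here is an assumption:

# Follow-up sketch: score the validation split and save the model;
# 'electra_cls_model' is an illustrative output directory.
model.evaluate(validate_x, validate_y)
model.save('electra_cls_model')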