def trainModel():
    # Train a CNN-LSTM classifier, then report accuracy on the dev set.
    model = CNNLSTMModel()
    model.fit(trainX, trainY, batch_size=16, epochs=5)
    predictions = model.predict(devX)
    # Fraction of dev predictions matching the gold labels.
    count = 0
    for pred, gold in zip(predictions, devY):
        if pred == gold:
            count += 1
    print(count / len(predictions))
    return predictions
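# A minimal sketch of how trainModel()'s globals might be prepared. The
# sample sentences, labels, and the plain token-list format are assumptions
# for illustration, not part of the original snippet; kashgari models accept
# lists of tokenized sentences like these.
from kashgari.tasks.classification import CNNLSTMModel

trainX = [['this', 'movie', 'is', 'great'], ['terrible', 'plot', 'and', 'acting']]
trainY = ['positive', 'negative']
devX = [['surprisingly', 'good', 'film']]
devY = ['positive']

predictions = trainModel()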
import pymysql
import tqdm
import jieba
from collections import Counter

from kashgari.tasks.classification import CNNModel, CNNLSTMModel, BLSTMModel


def test_dataset(model_dir: str) -> None:
    # Fetch page text from the database, classify it with the model,
    # and write the predictions back to the database.
    conn = pymysql.connect(host=DB_HOST,
                           port=int(DB_PORT),
                           user=DB_USER,
                           password=DB_PASS,
                           db=DB_NAME,
                           charset=DB_CHARSET)
    cursor = conn.cursor()
    model_name = model_dir.split('.model')[0].split('/')[-1]
    cursor.execute("""
        SELECT `page_text`, `page_title`, `category`, `hash`
        FROM `webpage_text`
        WHERE `%s_predict` IS NULL
        ORDER BY `time` DESC
        """ % model_name)
    all_text = []
    data = cursor.fetchall()
    # Pick the model class to load based on the model file name.
    if 'cnn.model' in model_dir:
        model = CNNModel.load_model(model_dir)
    elif 'cnnlstm.model' in model_dir:
        model = CNNLSTMModel.load_model(model_dir)
    elif 'blstm.model' in model_dir:
        model = BLSTMModel.load_model(model_dir)
    for i in tqdm.tqdm(data):
        label = i[2]
        # Segment the article with jieba, joining body and title,
        # then drop stopwords (strip_stopwords is a project helper).
        content = strip_stopwords(list(jieba.cut(i[0] + '。' + i[1])))
        all_text += content
        predict = model.predict(content)
        cursor.execute(
            'UPDATE `webpage_text` SET {model}_predict="{predict}" '.format(
                model=model_name, predict=predict) +
            'WHERE hash="%s"' % i[3])
        conn.commit()
        # print('[+] Predict:' + predict + ', Label:' + label + ', Title:' + i[1])
    # Count word frequencies and write the top-100 hot words to the database.
    c = Counter(all_text)
    i = 1
    cursor.execute('DELETE FROM `hot_key` WHERE 1=1')
    conn.commit()
    for k, v in c.most_common(100):
        if len(k) == 1:  # skip single-character tokens
            continue
        cursor.execute(
            'INSERT INTO `hot_key` VALUES ({0}, "{1}", {2})'.format(i, k, v))
        conn.commit()
        i += 1
    print('[+] Success')
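# A hypothetical invocation of test_dataset(). The connection constants and
# the model path below are illustrative placeholders, not values from the
# original code; strip_stopwords and a trained *.model file must also exist.
DB_HOST = '127.0.0.1'
DB_PORT = '3306'
DB_USER = 'root'
DB_PASS = 'secret'
DB_NAME = 'webpages'
DB_CHARSET = 'utf8mb4'

test_dataset('./models/cnnlstm.model')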
def test_basic_use(self):
    model_folder = '/Users/brikerman/Desktop/nlp/language_models/albert_base'
    checkpoint_path = os.path.join(model_folder, 'model.ckpt-best')
    config_path = os.path.join(model_folder, 'albert_config.json')
    vocab_path = os.path.join(model_folder, 'vocab_chinese.txt')
    tokenizer = BertTokenizer.load_from_vocab_file(vocab_path)
    embed = BERTEmbeddingV2(vocab_path, config_path, checkpoint_path,
                            bert_type='albert',
                            task=kashgari.CLASSIFICATION,
                            sequence_length=100)
    sentences = [
        "Jim Henson was a puppeteer.",
        "This here's an example of using the BERT tokenizer.",
        "Why did the chicken cross the road?"
    ]
    labels = ["class1", "class2", "class1"]
    sentences_tokenized = [tokenizer.tokenize(s) for s in sentences]
    print(sentences_tokenized)

    train_x, train_y = sentences_tokenized[:2], labels[:2]
    validate_x, validate_y = sentences_tokenized[2:], labels[2:]

    from kashgari.tasks.classification import CNNLSTMModel
    model = CNNLSTMModel(embed)

    # ------------ build model ------------
    model.fit(train_x, train_y,
              validate_x, validate_y,
              epochs=3,
              batch_size=32)
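# A possible follow-up at the end of test_basic_use(), assuming kashgari's
# standard predict() call on a list of token lists (names as in the test).
# The assertion only checks the output shape; the returned label itself
# depends on the randomly initialized model.
    predicted = model.predict(validate_x)
    self.assertEqual(len(predicted), 1)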
@classmethod
def setUpClass(cls):
    cls.epochs = 3
    embedding = EmbeddingManager.get_w2v()
    cls.model = CNNLSTMModel(embedding)
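# EmbeddingManager is not defined in these snippets. A minimal sketch of what
# such a helper might look like, assuming kashgari's WordEmbedding class and
# a placeholder word2vec path; the caching behavior is also an assumption.
from kashgari.embeddings import WordEmbedding

class EmbeddingManager:
    _w2v = None

    @classmethod
    def get_w2v(cls):
        # Build the embedding once and reuse it across test classes.
        if cls._w2v is None:
            cls._w2v = WordEmbedding(w2v_path='./embeddings/w2v.bin',
                                     sequence_length=100)
        return cls._w2v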
@classmethod
def setUpClass(cls):
    cls.epochs = 3
    cls.model = CNNLSTMModel()
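# A minimal sketch of a test method that could accompany the setUpClass()
# fixtures above; the method name and sample corpus are illustrative, not
# from the original suite. fit()/predict() are kashgari's standard calls.
def test_fit_and_predict(self):
    train_x = [['hello', 'world'], ['goodbye', 'world']] * 10
    train_y = ['greeting', 'farewell'] * 10
    self.model.fit(train_x, train_y,
                   epochs=self.epochs,
                   batch_size=8)
    results = self.model.predict(train_x[:2])
    self.assertEqual(len(results), 2)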
feature_list = []
label_list = []
max_len = 0
for i in five_CLS_data:
    feature_list.append(i[0])
    if len(i[0]) > max_len:
        max_len = len(i[0])
    label_list.append(i[1])

tokenizer = BertTokenizer.load_from_vocab_file(vocab_path)
embed = BERTEmbeddingV2(vocab_path, config_path, checkpoint_path,
                        bert_type='electra',
                        task=kashgari.CLASSIFICATION,
                        sequence_length=max_len)

# BERT tokenization
sentences_tokenized = [tokenizer.tokenize(s) for s in feature_list]
print(sentences_tokenized)

train_x, train_y = sentences_tokenized[:2], label_list[:2]
validate_x, validate_y = sentences_tokenized[2:], label_list[2:]

from kashgari.tasks.classification import CNNLSTMModel
model = CNNLSTMModel(embed)

# ------------ build model ------------
model.fit(train_x, train_y,
          validate_x, validate_y,
          epochs=3,
          batch_size=32)
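# five_CLS_data is undefined in the snippet above. A definition like this
# would precede it: a list of (text, label) pairs, which is the shape the
# loop assumes. The sentences and labels here are made-up placeholders.
five_CLS_data = [
    ('这部电影非常精彩', 'positive'),
    ('剧情拖沓,浪费时间', 'negative'),
    ('演员演技在线', 'positive'),
    ('音乐很动听', 'positive'),
    ('结局令人失望', 'negative'),
]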