def train_mentioned_model(train_data, train_segs, validate_data, validate_segs,
                          vectorizer, train_model):
    """Train and validate a binary "mentioned" classifier for one label group.

    Args:
        train_data: DataFrame of training rows; label columns are selected by index.
        train_segs: pre-segmented training texts fed to the classifier.
        validate_data: DataFrame of validation rows with the same label columns.
        validate_segs: pre-segmented validation texts.
        vectorizer: fitted text vectorizer passed through to TextClassifier.
        train_model: (model_name, start_col, end_col) tuple describing which
            label columns belong to this model.

    Returns:
        The fitted TextClassifier (saved to disk only if macro-F1 > 0.8).
    """
    model_name, start, end = train_model
    logger.info("start train %s mentioned", model_name)

    train_data_size = config.train_data_size
    # Convert the group's label columns into a single binary target:
    # when all (end - start + 1) labels equal -2 the item is "mentioned",
    # so sum(|labels|) // (count * 2) is 1; any other combination yields 0.
    sum_label_val = (end - start + 1) * 2
    column_list = range(start, end + 1)
    ori_labels = train_data.iloc[0:train_data_size, column_list]
    train_label = ori_labels.T.sum().abs() // sum_label_val

    logger.debug("begin to train data")
    cw = "balanced"
    mentioned_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
    mentioned_clf.fit(train_segs, train_label)

    logger.debug("begin to validate %s mentioned model", model_name)
    # Build the validation target with the same -2-collapse rule.
    ori_labels = validate_data.iloc[0:, column_list]
    validate_labels = ori_labels.T.sum().abs() // sum_label_val
    y_pre = mentioned_clf.predict(validate_segs)
    report(validate_labels, y_pre)
    score = f1_score(validate_labels, y_pre, average="macro")
    logger.info("validate done! %s mentioned model score:%s", model_name,
                str(score))

    if score > 0.8:
        logger.info("save %s mentioned model", model_name)
        model_save_path = config.model_save_path
        # exist_ok avoids the check-then-create race of exists()+makedirs().
        os.makedirs(model_save_path, exist_ok=True)
        # os.path.join is robust whether or not the configured path ends
        # with a separator (plain concatenation silently produced a wrong
        # path when it did not).
        joblib.dump(mentioned_clf,
                    os.path.join(model_save_path, model_name + "_mentioned.pkl"),
                    compress=3)
    return mentioned_clf
def train_specific_model(train_data):
    """Train one sentiment classifier per label column and report mean F1.

    Iterates over every column of ``train_data`` except the last (presumably
    the raw content column — TODO confirm against the CSV schema), fits a
    TextClassifier for each, evaluates it on the validation set, and dumps
    each fitted model to ``config.model_save_path``.

    Args:
        train_data: DataFrame with a ``content`` column plus one label
            column per model to train.
    """
    columns = train_data.columns.values.tolist()

    logger.debug("begin to seg train content")
    content_segments = seg_words(
        train_data.content.iloc[0:config.train_data_size])
    logger.debug("seg train content done")

    # os.path.join is robust whether or not the configured path ends with a
    # separator; bare concatenation was not.
    vectorizer = joblib.load(os.path.join(config.model_save_path, vec_name))
    logger.debug("load vectorizer")

    validate_data_df = load_data_from_csv(config.validate_data_path)
    validate_segs = seg_words(validate_data_df.content)
    logger.debug("seg validate content")

    scores = dict()
    for model_name in columns[:-1]:
        logger.info("begin to train %s model", model_name)
        # Candidate class-weight dicts for the four sentiment labels;
        # presumably TextClassifier searches over this list — TODO confirm
        # it accepts a list of weight dicts rather than a single dict.
        cw = [{-2: a, -1: b, 0: w, 1: x}
              for a in range(1, 3)
              for b in range(5, 8)
              for w in range(8, 12)
              for x in range(5, 8)]
        positive_clf = TextClassifier(vectorizer=vectorizer, class_weight=cw)
        y_label = train_data[model_name].iloc[0:config.train_data_size]
        positive_clf.fit(content_segments, y_label)

        y_pre = positive_clf.predict(validate_segs)
        y_true = validate_data_df[model_name].iloc[0:]
        report(y_true, y_pre)
        score = f1_score(y_true, y_pre, average="macro")
        logger.info("score for model:%s is %s ", model_name, str(score))
        scores[model_name] = score

        joblib.dump(positive_clf,
                    os.path.join(config.model_save_path, model_name + ".pkl"),
                    compress=True)

    mean_score = np.mean(list(scores.values()))
    # Lazy %-style args: formatting is deferred until the record is emitted
    # (the original eagerly formatted with the % operator).
    logger.info("f1_scores: %s", mean_score)
[train_label6[l] for l in range(len(config.class_group[5]))], Validation_seq, [ Validation_label[config.class_group[5][i]] for i in range(len(config.class_group[5])) ]) # 保存模型 model1.save('6lstm_1.npy') model2.save('6lstm_2.npy') model3.save('6lstm_3.npy') model4.save('6lstm_4.npy') model5.save('6lstm_5.npy') model6.save('6lstm_6.npy') # 评估模型 valid_pred1 = model1.predict(Validation_seq) valid_pred2 = model2.predict(Validation_seq) valid_pred3 = model3.predict(Validation_seq) valid_pred4 = model4.predict(Validation_seq) valid_pred5 = model5.predict(Validation_seq) valid_pred6 = model6.predict(Validation_seq) valid_pred = np.concatenate((valid_pred1, valid_pred2, valid_pred3, valid_pred4, valid_pred5, valid_pred6), axis=0) y_pred = [np.argmax(valid_pred[i], axis=1) for i in range(20)] y_true = [np.argmax(Validation_label[i], axis=1) for i in range(20)] f1 = [ metrics.f1_score(y_true[i], y_pred[i], average='micro') for i in range(len(valid_pred)) ]