def test_08_ridge_classifier(self):
    # Round-trip check for RidgeClassifier on the binary-classification data:
    # fit a one-step pipeline, export it with skl_to_pmml, score the exported
    # file through the adapa utility and compare the returned predictions and
    # probabilities with the ones computed locally.
    print("\ntest 08 (Ridge Classifier) [binary-class]\n")

    (X, X_test, y, features, target,
     test_file) = self.data_utility.get_data_for_binary_classification()

    ridge = RidgeClassifier()
    pipeline_obj = Pipeline([("model", ridge)])
    pipeline_obj.fit(X, y)

    pmml_path = 'test08sklearn.pmml'
    skl_to_pmml(pipeline_obj, features, target, pmml_path)

    deployed_name = self.adapa_utility.upload_to_zserver(pmml_path)
    server_pred, server_prob = self.adapa_utility.score_in_zserver(
        deployed_name, test_file)

    # Local reference values; the probabilities come from the estimator's
    # private _predict_proba_lr helper, called on the estimator itself.
    local_pred = pipeline_obj.predict(X_test)
    local_prob = ridge._predict_proba_lr(X_test)

    self.assertEqual(
        self.adapa_utility.compare_predictions(server_pred, local_pred),
        True)
    self.assertEqual(
        self.adapa_utility.compare_probability(server_prob, local_prob),
        True)
index=None, encoding='utf8') print('pac特征已保存\n') ########################### ridge(RidgeClassfiy) ################################ print('RidgeClassfiy stacking') stack_train = np.zeros((len(train), number)) stack_test = np.zeros((len(test), number)) score_va = 0 for i, (tr, va) in enumerate( StratifiedKFold(score, n_folds=n_folds, random_state=1017)): print('stack:%d/%d' % ((i + 1), n_folds)) ridge = RidgeClassifier(random_state=1017) ridge.fit(train_feature[tr], score[tr]) score_va = ridge._predict_proba_lr(train_feature[va]) score_te = ridge._predict_proba_lr(test_feature) print(score_va) print('得分' + str(mean_squared_error(score[va], ridge.predict(train_feature[va])))) stack_train[va] += score_va stack_test += \ score_te stack_test /= n_folds stack = np.vstack([stack_train, stack_test]) df_stack = pd.DataFrame() for i in range(stack.shape[1]): df_stack['tfidf_ridge_classfiy_{}'.format(i)] = np.around(stack[:, i], 6) df_stack.to_csv('feature/tfidf_ridge_2_error_single_classfiy.csv', index=None, encoding='utf8')
### L2 Regularization """
# alpha is the regularization strength: larger values specify stronger
# regularization. The sweep runs from 1e10 down to 1e-3 in 1000 log steps.
alphas = np.logspace(10, -3, 1000)
y_test.shape  # bare expression with no effect here (looks like a notebook leftover)

# Fit one RidgeClassifier per alpha and record its coefficients together
# with the train / test log-loss of its _predict_proba_lr output.
ridge_coefs, train_losses, test_losses = [], [], []
for a in alphas:
    ridge = RidgeClassifier(alpha=a, fit_intercept=True, normalize=True)
    ridge.fit(X_train, y_train)
    train_losses += [log_loss(y_train, ridge._predict_proba_lr(X_train))]
    test_losses += [log_loss(y_test, ridge._predict_proba_lr(X_test))]
    ridge_coefs += [ridge.coef_]

# Stack the coefficients into an array of shape (no_of_alphas, no_of_features).
ridge_coefs = np.reshape(np.array(ridge_coefs), (len(alphas), X.shape[1]))

# Plot how every coefficient varies with alpha (log-scaled x axis).
plt.style.use("seaborn")
ax = plt.gca()
ax.plot(alphas, ridge_coefs)
ax.set_xscale('log')
ax.set_xlabel('Value of Lambda')
ax.set_ylabel('Coefficients')
ax.axis('tight')
plt.show()
# Replace the raw labels with the integer codes produced by `lbl`
# (presumably an already fitted label encoder - confirm). `num_class`
# assumes the codes are the contiguous range 0..max.
train['label1'] = lbl.transform(train['label1'].values)
label = train['label1']
num_class = train['label1'].max() + 1

# ======================= model training: 5-fold cross-validation =======================
n_splits = 5
stack_train = np.zeros((train.shape[0], num_class))
stack_test = np.zeros((test.shape[0], num_class))
# `random_state` is deliberately not passed to StratifiedKFold: without
# shuffle=True it never influenced the splits, and recent scikit-learn
# releases raise a ValueError for that combination. The folds are unchanged.
for i, (tr, va) in enumerate(
        StratifiedKFold(n_splits=n_splits).split(trn_term_doc, label)):
    print('stack:%d/%d' % ((i + 1), n_splits))
    ridge = RidgeClassifier(random_state=42)
    # `tr` / `va` are positional indices, so the label Series is indexed
    # with .iloc; plain `label[tr]` is a label-based lookup and only matches
    # when the frame has a default 0..n-1 index.
    ridge.fit(trn_term_doc[tr], label.iloc[tr])
    score_va = ridge._predict_proba_lr(trn_term_doc[va])
    score_te = ridge._predict_proba_lr(test_term_doc)
    stack_train[va] += score_va
    # NOTE(review): stack_test holds the sum over the folds and is not
    # divided by n_splits here; argmax is unaffected - confirm downstream use.
    stack_test += score_te

# Out-of-fold accuracy of the most probable class.
print(
    "model acc_score:",
    metrics.accuracy_score(label,
                           np.argmax(stack_train, axis=1),
                           normalize=True,
                           sample_weight=None))

# Pick the first and second label: take the two classes with the highest
# probability.
m = pd.DataFrame(stack_train)
first = []
X_values, y_train, cv=kfold, n_jobs=1, scoring='roc_auc', verbose=0)
# Mean of the cross-validated ROC-AUC scores.
print('score {:.4}'.format(score.mean()))
# Recorded results for different column-selection rules:
# score 0.7853 roc auc: 0.783   col_name.startswith('number') or col_name.startswith('dt') or col_name.startswith('onehot')
# score 0.781  roc auc: 0.7787  if col_name.startswith('number')
# score 0.7832 roc auc: 0.7809  col_name.startswith('number') or col_name.startswith('onehot')
# score 0.7854 roc auc: 0.7831  col_name.startswith('number') or col_name.startswith('dt')

# ROC-AUC on the rows of df_test_d, using column 1 of the probability output.
result = df_test_d[['target']].copy()
# NOTE(review): the predict_proba result below is overwritten by the very
# next statement; one of the two lines was probably meant to be commented
# out - confirm which estimator `model` is.
result['prediction'] = model.predict_proba(X_test)[:, 1]
result['prediction'] = model._predict_proba_lr(X_test)[:, 1]
metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))

# Training data: the same evaluation on df_X_d / X_values.
result = df_X_d[['target']].copy()
# NOTE(review): same overwritten assignment as above.
result['prediction'] = model.predict_proba(X_values)[:, 1]
result['prediction'] = model._predict_proba_lr(X_values)[:, 1]
metric = roc_auc_score(result['target'], result['prediction'])
print('roc auc: {:.4}'.format(metric))
#
# Recorded output:
# roc auc: 0.7132
# roc auc: 0.9765
# Histogram of the predicted scores (the ones computed on X_values).
result['prediction'].hist(bins=100)
class NewsTFIDFModel:
    """TF-IDF news classifier built from a title model and a content model.

    Two TF-IDF vectorizers (title, content) feed two RidgeClassifier models
    trained on ``target``. Their ``_predict_proba_lr`` outputs are blended
    with fixed weights, except where the title model predicts class 3 or 6,
    in which case the title prediction is kept. A second pair of LGBM
    classifiers is trained only on rows whose ``LABEL_ENCODE`` is 10 or 11;
    the content one is used to refine merged class 10 (presumably
    ``target`` merges those two original classes - confirm with the data
    preparation code).

    Args:
        train_data: DataFrame holding the title, content, ``target`` and
            ``LABEL_ENCODE`` columns.
        target: name of the label column the linear models are trained on.
        tfidf_column: name of the title column.
        tfidf_content: name of the content column.
        load_model_flag: when True the fitted models are loaded from disk
            instead of creating fresh, unfitted ones.
        model_root_path: directory used for the pickled models.
        model_ds: prefix of the pickled model file names.
    """

    # Blend weights of the title / content probabilities.
    _TITLE_WEIGHT = 0.27
    _CONTENT_WEIGHT = 0.73
    # Classes for which the title model's prediction is kept as is.
    _TITLE_TRUSTED_LABELS = (3, 6)
    # Title suffixes handled by `after_predict_rule`.
    _RULE_SUFFIXES = ('公告(系列)', '摘要', '结果公示')

    def __init__(
            self,
            train_data,
            target='LABEL_NEW',
            tfidf_column='abstract_NEWS_TITLE',
            tfidf_content='CONTENT',
            load_model_flag=False,
            model_root_path='D:/Python_data/My_python/Projects/AIWIN_nlp_news_2021/model_save',
            model_ds='20210515'):
        self.train_data = train_data
        self.tfidf_column = tfidf_column
        self.tfidf_content = tfidf_content
        self.model_root_path = model_root_path
        self.target = target
        self.model_ds = model_ds
        self.load_model_flag = load_model_flag
        if load_model_flag:
            self.quick_model_load(self.model_ds)
        else:
            self.initial_model()

    def initial_model(self):
        """Create the unfitted vectorizers and classifiers."""
        self.tfidf_title = TfidfVectorizer(ngram_range=(1, 1),
                                           min_df=2,
                                           max_df=0.95)
        self.tfidf_content_model = TfidfVectorizer(ngram_range=(1, 3),
                                                   min_df=2,
                                                   max_df=0.95)
        self.clf_sub = LGBMClassifier(reg_lambda=1,
                                      reg_alpha=0.8,
                                      colsample_bytree=0.7,
                                      max_depth=12,
                                      num_leaves=20,
                                      subsample_freq=5,
                                      subsample=0.75,
                                      learning_rate=0.014,
                                      n_estimators=120)
        self.clf = RidgeClassifier(alpha=2.2, random_state=V1_SEED)

    def sentence2vec(self, train_title, train_content, fit=None):
        """Turn cleaned titles and contents into TF-IDF matrices.

        Args:
            train_title: iterable of cleaned title strings.
            train_content: iterable of cleaned content strings.
            fit: True fits both vectorizers on the given texts before
                transforming, False only transforms with the already fitted
                vectorizers. ``None`` (default) fits only when no fitted
                vectorizer is available yet, so calling this on prediction
                data after training no longer refits the vocabulary.

        Returns:
            Tuple ``(title_tfidf, content_tfidf)``.
        """
        if fit is None:
            fit = (getattr(self, 'tfidf_title_fit_model', None) is None
                   or getattr(self, 'tfidf_content_fit_model', None) is None)
        if fit:
            self.tfidf_title_fit_model = self.tfidf_title.fit(train_title)
            # The content vectorizer learns its vocabulary from the contents
            # (it used to be fitted on the titles).
            self.tfidf_content_fit_model = self.tfidf_content_model.fit(
                train_content)
        train_title_tfidf = self.tfidf_title_fit_model.transform(train_title)
        train_content_tfidf = self.tfidf_content_fit_model.transform(
            train_content)
        return train_title_tfidf, train_content_tfidf

    def sentence_clean(self, df):
        """Apply ``struct(text, V1_IGNORE_LIST, True)`` to titles and contents.

        Returns:
            Tuple ``(titles, contents)`` of mapped Series.
        """
        train_title = df[self.tfidf_column].map(
            lambda x: struct(x, V1_IGNORE_LIST, True))
        train_content = df.loc[:, self.tfidf_content].map(
            lambda x: struct(x, V1_IGNORE_LIST, True))
        return train_title, train_content

    @staticmethod
    def _row_normalize(matrix):
        """Return ``matrix`` with every row divided by its own sum.

        Rows that sum to zero are returned as zeros instead of NaN.
        """
        matrix = np.asarray(matrix, dtype=float)
        # keepdims keeps the sums as a column so that row i is divided by
        # the sum of row i (a flat vector would divide column-wise).
        row_sums = matrix.sum(axis=1, keepdims=True)
        return np.divide(matrix,
                         row_sums,
                         out=np.zeros_like(matrix),
                         where=row_sums != 0)

    def _fit_pair_and_report(self, title_model, content_model, tr_title_x,
                             tr_content_x, tr_labels, val_title_x,
                             val_content_x, val_labels):
        """Fit a title/content model pair and print train/validation macro F1.

        Returns:
            Tuple ``(title_val_pred, content_val_pred)``.
        """
        title_model.fit(tr_title_x, tr_labels)
        content_model.fit(tr_content_x, tr_labels)
        te_predict = title_model.predict(val_title_x)
        te_c_predict = content_model.predict(val_content_x)
        te_f1 = f1_score(val_labels, te_predict, average='macro')
        te_c_f1 = f1_score(val_labels, te_c_predict, average='macro')
        tr_f1 = f1_score(tr_labels,
                         title_model.predict(tr_title_x),
                         average='macro')
        tr_c_f1 = f1_score(tr_labels,
                           content_model.predict(tr_content_x),
                           average='macro')
        print(
            f'TITLE | te_f1: {te_f1:.4f} tr_f1: {tr_f1:.4f} f1_diff: {tr_f1-te_f1:.4f}'
        )
        print(
            f'CONENT | te_f1: {te_c_f1:.4f} tr_f1: {tr_c_f1:.4f} f1_diff: {tr_c_f1-te_c_f1:.4f}'
        )
        return te_predict, te_c_predict

    def _merge_title_content(self, title_pred, title_tfidf, content_tfidf):
        """Blend title and content models into one prediction per row.

        The title prediction is kept where it is one of the trusted classes;
        elsewhere the argmax of the weighted probability sum is used.
        NOTE(review): the argmax column index is used directly as the label,
        which assumes the classifier classes are 0..k-1 - confirm.
        """
        blended = np.argmax(
            self._TITLE_WEIGHT * self.clf._predict_proba_lr(title_tfidf) +
            self._CONTENT_WEIGHT * self.clf_c._predict_proba_lr(content_tfidf),
            axis=1)
        keep_title = np.isin(title_pred, list(self._TITLE_TRUSTED_LABELS))
        return np.where(keep_title, title_pred, blended)

    def _refine_with_sub_model(self, merged_pred, content_tfidf):
        """Shift merged class 11 to 12 and resolve class 10 with ``clf_c_sub``."""
        final = np.array(merged_pred, copy=True)
        final[final == 11] += 1
        return np.where(final == 10, self.clf_c_sub.predict(content_tfidf),
                        final)

    def train(self):
        """Fit all models on ``train_data``, save them and plot diagnostics.

        The data is split 80/20 (stratified on ``target``). Macro F1 of every
        model is printed, the models are pickled via ``model_save_all`` and
        four row-normalised confusion matrices are shown as heat maps.
        ``clf_sub`` is only created by ``initial_model``, so an instance
        built with ``load_model_flag=True`` does not have it.
        """
        print(self.train_data.columns)
        print('Clean the data')
        train_title, train_content = self.sentence_clean(self.train_data)
        print('word2vec tfidf')
        train_title_tfidf, train_content_tfidf = self.sentence2vec(
            train_title, train_content, fit=not self.load_model_flag)
        print('Start split data and train mdoels')
        (tr_x, val_x, tr_tfidf, val_tfidf, tr_c_tfidf, val_c_tfidf, tr_y,
         val_y, tr_original_y, val_original_y) = train_test_split(
             self.train_data[self.tfidf_column],
             train_title_tfidf,
             train_content_tfidf,
             self.train_data[self.target],
             self.train_data['LABEL_ENCODE'],
             stratify=self.train_data[self.target],
             test_size=0.2,
             random_state=V1_SEED)

        # Sub-models: only rows whose original label is 10 or 11.
        # clf_sub works on titles, clf_c_sub on contents.
        tr_sub = ((tr_original_y == 10) | (tr_original_y == 11)).values
        val_sub = ((val_original_y == 10) | (val_original_y == 11)).values
        self.clf_c_sub = clone(self.clf_sub)
        self._fit_pair_and_report(self.clf_sub, self.clf_c_sub,
                                  tr_tfidf[tr_sub], tr_c_tfidf[tr_sub],
                                  tr_original_y[tr_sub], val_tfidf[val_sub],
                                  val_c_tfidf[val_sub],
                                  val_original_y[val_sub])
        print('--' * 25)

        # Linear models on the full training split.
        self.clf_c = clone(self.clf)
        te_predict, te_c_predict = self._fit_pair_and_report(
            self.clf, self.clf_c, tr_tfidf, tr_c_tfidf, tr_y, val_tfidf,
            val_c_tfidf, val_y)
        self.model_save_all()

        te_mt = self._row_normalize(confusion_matrix(val_y, te_predict))
        te_c_mt = self._row_normalize(confusion_matrix(val_y, te_c_predict))

        m_pred = self._merge_title_content(te_predict, val_tfidf, val_c_tfidf)
        te_m_mt = self._row_normalize(confusion_matrix(val_y, m_pred))

        m_pred_final = self._refine_with_sub_model(m_pred, val_c_tfidf)
        # Recorded scores: 0.9040; with lgb 0.9098254876414202
        print('Final merged model f1_score | ',
              f1_score(val_original_y, m_pred_final, average='macro'))
        te_m_mt_f = self._row_normalize(
            confusion_matrix(val_original_y, m_pred_final))

        panels = (
            (te_mt, 'TFIDF-linear_title | abstract_NEWS_TITLE - LABEL_NEW '),
            (te_c_mt, 'TFIDF-linear_content | CONTENT - LABEL_NEW '),
            (te_m_mt, 'merged | title(3 & 6) & 0.27*title + 0.73*content'),
            (te_m_mt_f, 'Final_model | merged & LGB '),
        )
        fig, axes = plt.subplots(4, 1, figsize=(10, 15))
        for ax, (matrix, title) in zip(axes, panels):
            sns.heatmap(matrix, ax=ax)
            ax.set_title(title)
        plt.show()

    def after_predict_rule(self, news_title):
        """Return True when the title ends with one of the rule suffixes.

        The original note suggests such titles carry no company name and no
        label; ``predict`` maps them to class 5.
        """
        return news_title.endswith(self._RULE_SUFFIXES)

    def predict(self, title_content_df, after_fix=True, use_sub_model=False):
        """Predict a class for every row of ``title_content_df``.

        Args:
            title_content_df: DataFrame with the title and content columns.
            after_fix: when True, rows whose raw title matches
                ``after_predict_rule`` are forced to class 5.
            use_sub_model: when True, additionally apply the refinement that
                ``train`` evaluates as the final model (merged class 11
                becomes 12, merged class 10 is replaced by the prediction of
                ``clf_c_sub``). The default False returns the merged
                title/content prediction, as this method always did.

        Returns:
            numpy array with one predicted class per row.
        """
        title_content_df = title_content_df[[
            self.tfidf_column, self.tfidf_content
        ]].copy(deep=True)
        train_title, train_content = self.sentence_clean(title_content_df)
        # Never refit the vectorizers on prediction data.
        title_tfidf, content_tfidf = self.sentence2vec(train_title,
                                                       train_content,
                                                       fit=False)
        print(title_tfidf.shape, content_tfidf.shape)
        te_predict = self.clf.predict(title_tfidf)
        prediction = self._merge_title_content(te_predict, title_tfidf,
                                               content_tfidf)
        if use_sub_model:
            prediction = self._refine_with_sub_model(prediction,
                                                     content_tfidf)
        if after_fix:
            five_bool = title_content_df[self.tfidf_column].map(
                self.after_predict_rule)
            prediction[np.asarray(five_bool, dtype=bool)] = 5
        return prediction

    def model_save(self, model, model_name):
        """Pickle ``model`` to ``<model_root_path>/<model_name>.pkl``."""
        # makedirs also creates missing parent directories.
        os.makedirs(self.model_root_path, exist_ok=True)
        target_path = os.path.join(self.model_root_path, f'{model_name}.pkl')
        with open(target_path, 'wb') as handle:
            pickle.dump(model, handle)

    def model_load(self, model_name):
        """Load ``<model_root_path>/<model_name>.pkl``.

        Raises:
            FileNotFoundError: if the pickle file does not exist.
        """
        source_path = os.path.join(self.model_root_path, f'{model_name}.pkl')
        # SECURITY: pickle executes code while loading; only load model
        # files that come from a trusted source.
        with open(source_path, 'rb') as handle:
            return pickle.load(handle)

    def quick_model_load(self, ds):
        """Load the fitted classifiers and vectorizers saved under prefix ``ds``."""
        self.clf = self.model_load(f'{ds}_title_linear_model')
        self.clf_c = self.model_load(f'{ds}_content_linear_model')
        self.clf_c_sub = self.model_load(f'{ds}_content1011_lgb_model')
        self.tfidf_title_fit_model = self.model_load(f'{ds}_title_tfidf_model')
        self.tfidf_content_fit_model = self.model_load(
            f'{ds}_content_tfidf_model')
        print('Finished load model......')

    def model_save_all(self):
        """Pickle every fitted model that ``quick_model_load`` reads back."""
        prefix = self.model_ds
        self.model_save(self.clf, f'{prefix}_title_linear_model')
        self.model_save(self.clf_c, f'{prefix}_content_linear_model')
        self.model_save(self.clf_c_sub, f'{prefix}_content1011_lgb_model')
        self.model_save(self.tfidf_title_fit_model,
                        f'{prefix}_title_tfidf_model')
        self.model_save(self.tfidf_content_fit_model,
                        f'{prefix}_content_tfidf_model')