Пример #1
0
    def test_08_ridge_classifier(self):
        """Export a binary RidgeClassifier pipeline to PMML and compare scores.

        The pipeline is fitted locally, exported with ``skl_to_pmml``,
        uploaded and scored through ``self.adapa_utility``; the returned
        predictions and probabilities must match the local model's.
        """
        print("\ntest 08 (Ridge Classifier) [binary-class]\n")
        (X, X_test, y, features, target,
         test_file) = self.data_utility.get_data_for_binary_classification()

        ridge = RidgeClassifier()
        pipeline_obj = Pipeline([("model", ridge)])
        pipeline_obj.fit(X, y)

        pmml_path = 'test08sklearn.pmml'
        skl_to_pmml(pipeline_obj, features, target, pmml_path)

        # Score the exported model remotely.
        uploaded_name = self.adapa_utility.upload_to_zserver(pmml_path)
        predictions, probabilities = self.adapa_utility.score_in_zserver(
            uploaded_name, test_file)

        # Reference values from the in-process model; RidgeClassifier has no
        # public predict_proba, hence the private _predict_proba_lr helper.
        expected_pred = pipeline_obj.predict(X_test)
        expected_prob = ridge._predict_proba_lr(X_test)

        predictions_match = self.adapa_utility.compare_predictions(
            predictions, expected_pred)
        self.assertEqual(predictions_match, True)
        probabilities_match = self.adapa_utility.compare_probability(
            probabilities, expected_prob)
        self.assertEqual(probabilities_match, True)
                index=None,
                encoding='utf8')
print('pac特征已保存\n')

# ---------------- RidgeClassifier stacking features ----------------
# For every fold a RidgeClassifier is fitted on the training split; its
# scores for the held-out rows go into `stack_train`, while its scores for
# the test matrix are summed into `stack_test` and averaged after the loop.
print('RidgeClassfiy stacking')
stack_train = np.zeros((len(train), number))
stack_test = np.zeros((len(test), number))
score_va = 0

for i, (tr, va) in enumerate(
        StratifiedKFold(score, n_folds=n_folds, random_state=1017)):
    print('stack:%d/%d' % ((i + 1), n_folds))

    ridge = RidgeClassifier(random_state=1017)
    ridge.fit(train_feature[tr], score[tr])

    score_va = ridge._predict_proba_lr(train_feature[va])
    score_te = ridge._predict_proba_lr(test_feature)
    print(score_va)

    # Mean squared error between the true and predicted labels of the fold.
    _fold_mse = mean_squared_error(score[va], ridge.predict(train_feature[va]))
    print('得分' + str(_fold_mse))

    stack_train[va] += score_va
    stack_test += score_te

stack_test /= n_folds

# Training rows first, test rows after; one CSV column per score column,
# rounded to six decimals.
stack = np.vstack([stack_train, stack_test])
df_stack = pd.DataFrame()
for i, _column in enumerate(stack.T):
    df_stack['tfidf_ridge_classfiy_{}'.format(i)] = np.around(_column, 6)
df_stack.to_csv(
    'feature/tfidf_ridge_2_error_single_classfiy.csv',
    index=None,
    encoding='utf8',
)
Пример #3
0
### L2 Regularization
"""

# alpha is the regularization strength: larger values regularize more strongly.
# 1000 log-spaced values from 1e10 down to 1e-3.
alphas = np.logspace(10, -3, 1000)

y_test.shape

# Fit one RidgeClassifier per alpha, recording train/test log-loss and the
# learned coefficients.
# NOTE(review): `normalize=` was removed from RidgeClassifier in newer
# scikit-learn releases -- confirm the pinned version still accepts it.
ridge_coefs, train_losses, test_losses = [], [], []
for a in alphas:
    ridge = RidgeClassifier(alpha=a, fit_intercept=True, normalize=True)
    ridge.fit(X_train, y_train)

    _train_proba = ridge._predict_proba_lr(X_train)
    train_losses.append(log_loss(y_train, _train_proba))

    _test_proba = ridge._predict_proba_lr(X_test)
    test_losses.append(log_loss(y_test, _test_proba))

    ridge_coefs.append(ridge.coef_)

# Stack the coefficients into an array of shape (n_alphas, n_features).
ridge_coefs = np.array(ridge_coefs).reshape((len(alphas), X.shape[1]))

# Coefficient paths as a function of alpha, on a logarithmic x axis.
plt.style.use("seaborn")
ax = plt.gca()
ax.plot(alphas, ridge_coefs)
ax.set_xscale('log')
plt.xlabel('Value of Lambda')
plt.ylabel('Coefficients')
plt.axis('tight')
plt.show()
Пример #4
0
# Encode the labels as consecutive integers; the class count is max + 1.
train['label1'] = lbl.transform(train['label1'].values)
label = train['label1']
num_class = train['label1'].max() + 1

# =============== Model training: 5-fold cross-validation ===============
n_splits = 5
stack_train = np.zeros((train.shape[0], num_class))
stack_test = np.zeros((test.shape[0], num_class))
# `random_state` is deliberately not passed: without `shuffle=True` it has no
# effect on the (deterministic) folds, and scikit-learn >= 0.24 raises a
# ValueError when it is set together with shuffle=False.
for i, (tr, va) in enumerate(
        StratifiedKFold(n_splits=n_splits).split(trn_term_doc, label)):
    print('stack:%d/%d' % ((i + 1), n_splits))

    ridge = RidgeClassifier(random_state=42)
    ridge.fit(trn_term_doc[tr], label[tr])
    score_va = ridge._predict_proba_lr(trn_term_doc[va])
    score_te = ridge._predict_proba_lr(test_term_doc)

    # Out-of-fold scores for the training rows; test scores are summed.
    # NOTE(review): stack_test is not divided by n_splits in the visible
    # code -- confirm whether it is averaged (or only arg-maxed) later.
    stack_train[va] += score_va
    stack_test += score_te

print(
    "model acc_score:",
    metrics.accuracy_score(label,
                           np.argmax(stack_train, axis=1),
                           normalize=True,
                           sample_weight=None))

# Get the first and second labels: take the two highest-probability classes.
m = pd.DataFrame(stack_train)
first = []
Пример #5
0
                        X_values,
                        y_train,
                        cv=kfold,
                        n_jobs=1,
                        scoring='roc_auc',
                        verbose=0)

print('score {:.4}'.format(score.mean()))
#score 0.7853 roc auc: 0.783 col_name.startswith('number') or col_name.startswith('dt') or col_name.startswith('onehot')
# score 0.781 roc auc: 0.7787 if col_name.startswith('number')
# score 0.7832  roc auc: 0.7809  col_name.startswith('number') or col_name.startswith('onehot')
# score 0.7854 roc auc: 0.7831 col_name.startswith('number') or col_name.startswith('dt')

# ROC AUC on the test frame first, then on the training frame.
for _frame, _matrix in ((df_test_d, X_test), (df_X_d, X_values)):
    result = _frame[['target']].copy()
    # NOTE(review): the predict_proba column is overwritten on the next line,
    # so only the _predict_proba_lr scores reach the metric -- confirm which
    # of the two is intended.
    result['prediction'] = model.predict_proba(_matrix)[:, 1]
    result['prediction'] = model._predict_proba_lr(_matrix)[:, 1]

    metric = roc_auc_score(result['target'], result['prediction'])
    print('roc auc: {:.4}'.format(metric))
#
# roc auc: 0.7132
# roc auc: 0.9765

# Histogram of the last computed (training-frame) scores.
result['prediction'].hist(bins=100)
Пример #6
0
class NewsTFIDFModel():
    """TF-IDF news classifier combining a title model and a content model.

    Two ``RidgeClassifier`` models (title TF-IDF and content TF-IDF) are
    blended with fixed weights; two ``LGBMClassifier`` sub-models are trained
    only on rows whose ``LABEL_ENCODE`` is 10 or 11. Fitted vectorizers and
    models are pickled under ``model_root_path`` with a ``model_ds`` prefix.

    Args:
        train_data: DataFrame holding the text columns, ``target`` and a
            ``LABEL_ENCODE`` column (read by :meth:`train`).
        target: Name of the label column used by the linear models.
        tfidf_column: Name of the title text column.
        tfidf_content: Name of the content text column.
        load_model_flag: When true, load pickled models instead of creating
            fresh, unfitted ones.
        model_root_path: Directory used for pickled artefacts.
        model_ds: Prefix (date stamp) of the pickled artefact file names.
    """

    def __init__(
            self,
            train_data,
            target='LABEL_NEW',
            tfidf_column='abstract_NEWS_TITLE',
            tfidf_content='CONTENT',
            load_model_flag=False,
            model_root_path='D:/Python_data/My_python/Projects/AIWIN_nlp_news_2021/model_save',
            model_ds='20210515'):
        self.train_data = train_data
        self.tfidf_column = tfidf_column
        self.tfidf_content = tfidf_content
        self.model_root_path = model_root_path
        self.target = target
        self.model_ds = model_ds
        self.load_model_flag = load_model_flag

        # NOTE(review): when models are loaded, initial_model() is skipped, so
        # the unfitted estimators train() clones (self.clf_sub) do not exist.
        if load_model_flag:
            self.quick_model_load(self.model_ds)
        else:
            self.initial_model()

    def initial_model(self):
        """Create the unfitted vectorizers and classifiers."""
        self.tfidf_title = TfidfVectorizer(ngram_range=(1, 1),
                                           min_df=2,
                                           max_df=0.95)
        self.tfidf_content_model = TfidfVectorizer(ngram_range=(1, 3),
                                                   min_df=2,
                                                   max_df=0.95)
        self.clf_sub = LGBMClassifier(reg_lambda=1,
                                      reg_alpha=0.8,
                                      colsample_bytree=0.7,
                                      max_depth=12,
                                      num_leaves=20,
                                      subsample_freq=5,
                                      subsample=0.75,
                                      learning_rate=0.014,
                                      n_estimators=120)
        self.clf = RidgeClassifier(alpha=2.2, random_state=V1_SEED)

    def sentence2vec(self, train_title, train_content, fit=None):
        """Turn cleaned titles and contents into TF-IDF matrices.

        Args:
            train_title: Iterable of cleaned title strings.
            train_content: Iterable of cleaned content strings.
            fit: ``True`` to (re)fit both vectorizers on the given texts,
                ``False`` to only transform. ``None`` (default) fits only
                when no models were loaded and the vectorizers have not been
                fitted yet, so later calls on new data never silently
                replace the vocabulary the classifiers were trained on.

        Returns:
            Tuple ``(title_tfidf, content_tfidf)``.
        """
        if fit is None:
            fit = (not self.load_model_flag
                   and not hasattr(self, 'tfidf_title_fit_model'))
        if fit:
            self.tfidf_title_fit_model = self.tfidf_title.fit(train_title)
            # The content vectorizer must learn its vocabulary from the
            # contents (it was previously fitted on the titles by mistake).
            self.tfidf_content_fit_model = self.tfidf_content_model.fit(
                train_content)

        train_title_tfidf = self.tfidf_title_fit_model.transform(train_title)
        train_content_tfidf = self.tfidf_content_fit_model.transform(
            train_content)
        return train_title_tfidf, train_content_tfidf

    def sentence_clean(self, df):
        """Apply ``struct(x, V1_IGNORE_LIST, True)`` to both text columns.

        Returns:
            Tuple ``(titles, contents)`` of the mapped columns.
        """
        train_title = df[self.tfidf_column].map(
            lambda x: struct(x, V1_IGNORE_LIST, True))
        train_content = df.loc[:, self.tfidf_content].map(
            lambda x: struct(x, V1_IGNORE_LIST, True))
        return train_title, train_content

    @staticmethod
    def _row_normalize(matrix):
        """Divide every row of *matrix* by its own sum (all-zero rows stay 0)."""
        matrix = np.asarray(matrix)
        # keepdims=True keeps the totals as a column vector so that row i is
        # divided by total i; a flat vector would broadcast across columns.
        totals = matrix.sum(axis=1, keepdims=True)
        return np.divide(matrix,
                         totals,
                         out=np.zeros(matrix.shape, dtype=float),
                         where=totals != 0)

    @staticmethod
    def _report_f1(prefix, val_true, val_pred, tr_true, tr_pred):
        """Print macro-F1 on validation and training data and their gap."""
        te_f1 = f1_score(val_true, val_pred, average='macro')
        tr_f1 = f1_score(tr_true, tr_pred, average='macro')
        print(
            f'{prefix} te_f1: {te_f1:.4f} tr_f1: {tr_f1:.4f} f1_diff: {tr_f1-te_f1:.4f}'
        )

    def _merge_predict(self, title_tfidf, content_tfidf, title_pred=None):
        """Blend the title and content linear models into one prediction.

        The title model's prediction is kept where it is 3 or 6; elsewhere
        the arg-max of ``0.27 * title + 0.73 * content`` scores is used.
        """
        if title_pred is None:
            title_pred = self.clf.predict(title_tfidf)
        blended = np.argmax(
            0.27 * self.clf._predict_proba_lr(title_tfidf) +
            0.73 * self.clf_c._predict_proba_lr(content_tfidf),
            axis=1)
        return np.where((title_pred == 3) | (title_pred == 6), title_pred,
                        blended)

    def _refine_with_sub_model(self, merged_pred, content_tfidf):
        """Shift label 11 to 12 and resolve label 10 with the content LGB model."""
        refined = merged_pred.copy()
        refined[refined == 11] += 1
        return np.where(refined == 10, self.clf_c_sub.predict(content_tfidf),
                        refined)

    def train(self):
        """Fit all models, print F1 reports, save artefacts and plot heatmaps."""
        print(self.train_data.columns)
        print('Clean the data')
        train_title, train_content = self.sentence_clean(self.train_data)
        print('word2vec tfidf')
        train_title_tfidf, train_content_tfidf = self.sentence2vec(
            train_title, train_content, fit=not self.load_model_flag)

        print('Start split data and train mdoels')
        # One stratified 80/20 split shared by every array.
        (_tr_x, _val_x, tr_tfidf, val_tfidf, tr_c_tfidf, val_c_tfidf, tr_y,
         val_y, tr_original_y, val_original_y) = train_test_split(
             self.train_data[self.tfidf_column],
             train_title_tfidf,
             train_content_tfidf,
             self.train_data[self.target],
             self.train_data['LABEL_ENCODE'],
             stratify=self.train_data[self.target],
             test_size=0.2,
             random_state=V1_SEED)

        # Sub-models see only the rows whose original label is 10 or 11.
        tr_mask = ((tr_original_y == 10) | (tr_original_y == 11)).values
        tr_tfidf_sub = tr_tfidf[tr_mask]
        tr_c_tfidf_sub = tr_c_tfidf[tr_mask]
        tr_original_y_sub = tr_original_y[tr_mask]

        val_mask = ((val_original_y == 10) | (val_original_y == 11)).values
        val_tfidf_sub = val_tfidf[val_mask]
        val_c_tfidf_sub = val_c_tfidf[val_mask]
        val_original_y_sub = val_original_y[val_mask]

        # clf_sub: title features; clf_c_sub: content features.
        self.clf_c_sub = clone(self.clf_sub)
        self.clf_sub.fit(tr_tfidf_sub, tr_original_y_sub)
        self.clf_c_sub.fit(tr_c_tfidf_sub, tr_original_y_sub)

        self._report_f1('TITLE  |', val_original_y_sub,
                        self.clf_sub.predict(val_tfidf_sub),
                        tr_original_y_sub,
                        self.clf_sub.predict(tr_tfidf_sub))
        self._report_f1('CONENT |', val_original_y_sub,
                        self.clf_c_sub.predict(val_c_tfidf_sub),
                        tr_original_y_sub,
                        self.clf_c_sub.predict(tr_c_tfidf_sub))

        print('--' * 25)
        self.clf_c = clone(self.clf)
        self.clf.fit(tr_tfidf, tr_y)
        self.clf_c.fit(tr_c_tfidf, tr_y)

        te_predict = self.clf.predict(val_tfidf)
        te_c_predict = self.clf_c.predict(val_c_tfidf)

        self._report_f1('TITLE    |', val_y, te_predict, tr_y,
                        self.clf.predict(tr_tfidf))
        self._report_f1('CONENT   |', val_y, te_c_predict, tr_y,
                        self.clf_c.predict(tr_c_tfidf))

        self.model_save_all()

        te_mt = self._row_normalize(confusion_matrix(val_y, te_predict))
        te_c_mt = self._row_normalize(confusion_matrix(val_y, te_c_predict))

        m_pred = self._merge_predict(val_tfidf,
                                     val_c_tfidf,
                                     title_pred=te_predict)
        te_m_mt = self._row_normalize(confusion_matrix(val_y, m_pred))

        m_pred_final = self._refine_with_sub_model(m_pred, val_c_tfidf)

        # Scores recorded by the original author: 0.9040; with LGB
        # 0.9098254876414202.
        print('Final merged model f1_score | ',
              f1_score(val_original_y, m_pred_final, average='macro'))

        te_m_mt_f = self._row_normalize(
            confusion_matrix(val_original_y, m_pred_final))

        fig, axes = plt.subplots(4, 1, figsize=(10, 15))
        panels = (
            (te_mt, 'TFIDF-linear_title | abstract_NEWS_TITLE - LABEL_NEW '),
            (te_c_mt, 'TFIDF-linear_content | CONTENT - LABEL_NEW '),
            (te_m_mt, 'merged | title(3 & 6) &  0.27*title + 0.73*content'),
            (te_m_mt_f, 'Final_model | merged & LGB '),
        )
        for axis, (matrix, title) in zip(axes, panels):
            sns.heatmap(matrix, ax=axis)
            axis.set_title(title)
        plt.show()

    def after_predict_rule(self, news_title):
        """Return True when the title ends with a boilerplate suffix.

        The suffixes are '公告(系列)', '摘要' and '结果公示'; :meth:`predict`
        overrides the label of matching rows with 5.
        """
        return news_title.endswith(('公告(系列)', '摘要', '结果公示'))

    def predict(self, title_content_df, after_fix=True):
        """Predict merged labels for a DataFrame of titles and contents.

        Args:
            title_content_df: DataFrame containing the title and content
                columns named at construction time.
            after_fix: When true, rows whose raw title satisfies
                :meth:`after_predict_rule` are forced to label 5.

        Returns:
            Array of labels from the blended title/content linear models.
        """
        title_content_df = title_content_df[[
            self.tfidf_column, self.tfidf_content
        ]].copy(deep=True)
        train_title, train_content = self.sentence_clean(title_content_df)
        # Transform only: refitting the vectorizers on prediction data would
        # change the feature space the classifiers were trained on.
        title_tfidf, content_tfidf = self.sentence2vec(train_title,
                                                       train_content,
                                                       fit=False)
        print(title_tfidf.shape, content_tfidf.shape)

        merge_predict = self._merge_predict(title_tfidf, content_tfidf)

        # NOTE(review): train() additionally refines labels 10/11 with
        # clf_c_sub before scoring; the same refinement used to be computed
        # here and then discarded. The return value is unchanged -- confirm
        # whether the refined labels were meant to be returned instead.
        if after_fix:
            five_bool = title_content_df[self.tfidf_column].map(
                self.after_predict_rule)
            merge_predict[five_bool.values.astype(bool)] = 5
        return merge_predict

    def model_save(self, model, model_name):
        """Pickle *model* to ``<model_root_path>/<model_name>.pkl``."""
        os.makedirs(self.model_root_path, exist_ok=True)
        target_path = os.path.join(self.model_root_path, f'{model_name}.pkl')
        with open(target_path, 'wb') as handle:
            pickle.dump(model, handle)

    def model_load(self, model_name):
        """Unpickle ``<model_root_path>/<model_name>.pkl`` and return it.

        Raises:
            FileNotFoundError: If the pickle file does not exist.
        """
        source_path = os.path.join(self.model_root_path, f'{model_name}.pkl')
        # SECURITY: pickle can execute arbitrary code while loading; only
        # load artefacts that this project wrote itself.
        with open(source_path, 'rb') as handle:
            return pickle.load(handle)

    def quick_model_load(self, ds):
        """Load every pickled model and vectorizer saved under prefix *ds*."""
        self.clf = self.model_load(f'{ds}_title_linear_model')
        self.clf_c = self.model_load(f'{ds}_content_linear_model')
        self.clf_c_sub = self.model_load(f'{ds}_content1011_lgb_model')
        self.tfidf_title_fit_model = self.model_load(f'{ds}_title_tfidf_model')
        self.tfidf_content_fit_model = self.model_load(
            f'{ds}_content_tfidf_model')
        print('Finished load model......')

    def model_save_all(self):
        """Pickle the linear models, the content LGB model and both vectorizers."""
        self.model_save(self.clf, f'{self.model_ds}_title_linear_model')
        self.model_save(self.clf_c, f'{self.model_ds}_content_linear_model')
        self.model_save(self.clf_c_sub,
                        f'{self.model_ds}_content1011_lgb_model')
        self.model_save(self.tfidf_title_fit_model,
                        f'{self.model_ds}_title_tfidf_model')
        self.model_save(self.tfidf_content_fit_model,
                        f'{self.model_ds}_content_tfidf_model')