예제 #1
0
    def train_for_threshold(self, features, target='label', num=35000):
        """Train a LightGBM model on part of ``self.train_`` and search a threshold.

        Rows whose ``ID`` is below ``num`` are used for fitting; rows whose
        ``ID`` is at least ``num`` form the validation split. The validation
        predictions are passed through ``sort_val`` and ``find_threshold``,
        and the resulting threshold is stored on ``self.threshold``.

        Args:
            features: Column selector applied to the frame to build the
                feature matrix.
            target: Name of the label column; its values are cast to uint8.
            num: ``ID`` cut-off separating training rows from validation rows.

        Returns:
            The threshold found by ``find_threshold`` (also kept in
            ``self.threshold``).
        """
        frame = self.train_
        fit_rows = frame[frame.ID < num]
        holdout_rows = frame[frame.ID >= num]

        fit_X = fit_rows[features].values
        fit_y = fit_rows[target].values.astype('uint8')
        holdout_X = holdout_rows[features].values
        holdout_y = holdout_rows[target].values.astype('uint8')

        fit_set = lgb.Dataset(fit_X, fit_y)
        holdout_set = lgb.Dataset(holdout_X, holdout_y, reference=fit_set)

        booster = lgb.train(
            self.params,
            fit_set,
            num_boost_round=10000,
            valid_sets=[fit_set, holdout_set],
            valid_names=['train', 'valid'],
            early_stopping_rounds=100,
            verbose_eval=1000,
        )
        holdout_scores = booster.predict(holdout_X)

        # Ground-truth entities of the validation split, together with the
        # predicted probabilities and their corresponding words in sorted order.
        gt_ent, pred_words, pred_proba = sort_val(holdout_rows, holdout_scores)
        # Threshold selected by the search; the second return value is unused.
        self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

        return self.threshold
예제 #2
0
    def train_for_threshold(self, features, target='label', num=35000):
        """Train an XGBoost model on part of ``self.train_`` and search a threshold.

        Rows whose ``ID`` is below ``num`` are used for fitting; rows whose
        ``ID`` is at least ``num`` form the validation split. The validation
        predictions are passed through ``sort_val`` and ``find_threshold``,
        and the resulting threshold is stored on ``self.threshold``.

        Args:
            features: Column selector applied to the frame to build the
                feature matrix.
            target: Name of the label column; its values are cast to uint8.
            num: ``ID`` cut-off separating training rows from validation rows.

        Returns:
            The threshold found by ``find_threshold`` (also kept in
            ``self.threshold``).
        """
        frame = self.train_
        fit_rows = frame[frame.ID < num]
        holdout_rows = frame[frame.ID >= num]

        fit_X = fit_rows[features].values
        fit_y = fit_rows[target].values.astype('uint8')
        holdout_X = holdout_rows[features].values
        holdout_y = holdout_rows[target].values.astype('uint8')

        fit_matrix = xgb.DMatrix(fit_X, fit_y)
        holdout_matrix = xgb.DMatrix(holdout_X, holdout_y)

        # The validation matrix is listed last in the watch list.
        watch_list = [(fit_matrix, 'train'), (holdout_matrix, 'eval')]
        booster = xgb.train(
            self.params,
            fit_matrix,
            num_boost_round=1000,
            evals=watch_list,
            early_stopping_rounds=100,
            verbose_eval=100,
        )
        # NOTE(review): depending on the installed xgboost version, predict()
        # may score with every trained round rather than the early-stopped
        # best iteration -- confirm against the version in use.
        holdout_scores = booster.predict(holdout_matrix)

        # Ground-truth entities of the validation split, together with the
        # predicted probabilities and their corresponding words in sorted order.
        gt_ent, pred_words, pred_proba = sort_val(holdout_rows, holdout_scores)
        # Threshold selected by the search; the second return value is unused.
        self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

        return self.threshold
예제 #3
0
    def save_result(self):
        """Turn the stored test predictions into entities and write them out.

        Reads ``self.test_``, ``self.pred``, ``self.threshold`` and
        ``self.opt``; the output is handed to ``save_as_order`` under the
        file name ``'lgb_result.txt'``.
        """
        sorted_test = sort_val(self.test_, self.pred, predict=True)
        sample_ids, words, probabilities = sorted_test

        # Apply the previously stored threshold to pick the entities.
        picked_entities = return_entity(words, probabilities, self.threshold)

        save_as_order(sample_ids, picked_entities, self.opt, 'lgb_result.txt')
예제 #4
0
    def train_for_threshold(self, features, target='label', num=35000):
        """Train a CatBoost model on part of ``self.train_`` and search a threshold.

        Rows whose ``ID`` is below ``num`` are used for fitting; rows whose
        ``ID`` is at least ``num`` form the validation split. The second
        column of the validation probability output is passed through
        ``sort_val`` and ``find_threshold``, and the resulting threshold is
        stored on ``self.threshold``.

        Args:
            features: Column selector applied to the frame to build the
                feature matrix.
            target: Name of the label column; its values are cast to uint8.
            num: ``ID`` cut-off separating training rows from validation rows.

        Returns:
            The threshold found by ``find_threshold`` (also kept in
            ``self.threshold``).
        """
        frame = self.train_
        fit_rows = frame[frame.ID < num]
        holdout_rows = frame[frame.ID >= num]

        fit_X = fit_rows[features].values
        fit_y = fit_rows[target].values.astype('uint8')
        holdout_X = holdout_rows[features].values
        holdout_y = holdout_rows[target].values.astype('uint8')

        fit_pool = Pool(fit_X, fit_y)
        holdout_pool = Pool(holdout_X, holdout_y)

        model = catboost.train(
            fit_pool,
            self.params,
            iterations=10000,
            eval_set=holdout_pool,
            early_stopping_rounds=200,
            verbose=500,
        )
        # Keep only column 1 of the per-class probability output.
        class_probabilities = model.predict(holdout_pool,
                                            prediction_type='Probability')
        holdout_scores = class_probabilities[:, 1]

        # Ground-truth entities of the validation split, together with the
        # predicted probabilities and their corresponding words in sorted order.
        gt_ent, pred_words, pred_proba = sort_val(holdout_rows, holdout_scores)
        # Threshold selected by the search; the second return value is unused.
        self.threshold, _ = find_threshold(gt_ent, pred_words, pred_proba)

        return self.threshold