Example #1
def predict(X_test: pd.DataFrame, y_test, gbm: lgb.Booster):
    # Predict class probabilities of shape (n_samples, n_classes).
    pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    # Take the most probable class for each row.
    y_pred = np.argmax(pred, axis=1)

    # Print the precision and recall, among other metrics.
    # `Categories` is assumed to be a module-level list of class names.
    print(
        metrics.classification_report(y_test, y_pred, target_names=Categories))
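For context, here is a minimal, self-contained sketch of how the helper above might be driven end to end. The random data, feature names, and the Categories list are placeholders invented for illustration, not part of the original example:

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn import metrics

Categories = ["cat_a", "cat_b", "cat_c"]  # hypothetical class names

# Train a small multiclass model on random data (illustration only).
rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f"f{i}" for i in range(5)])
y = rng.integers(0, 3, size=200)
gbm = lgb.train({"objective": "multiclass", "num_class": 3, "verbose": -1},
                lgb.Dataset(X, label=y))

predict(X, y, gbm)  # reuses the helper defined above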
Example #2
def predict(gbm: lgb.Booster, test_data: pd.DataFrame, full_data: pd.DataFrame, feature_names: List[str]):
    # FR(-1) rolls the current date back to the most recent Friday.
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = last_friday.strftime('%Y-%m-%d')
    print(date_string)
    # Score only that day's rows that have all features present.
    live_data = full_data.loc[date_string].copy()
    live_data.dropna(subset=feature_names, inplace=True)
    live_data[PREDICTION_NAME] = gbm.predict(live_data[feature_names])
    test_data[PREDICTION_NAME] = gbm.predict(test_data[feature_names])
    return dict(
        predicted_live_data=live_data,
        predicted_test_data=test_data
    )
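The dateutil idiom in the snippet above is easy to verify in isolation; a minimal sketch:

from datetime import datetime
from dateutil.relativedelta import relativedelta, FR

# FR(-1) means "the nearest Friday on or before this date".
today = datetime(2024, 5, 15)  # a Wednesday
last_friday = today + relativedelta(weekday=FR(-1))
print(last_friday.strftime('%Y-%m-%d'))  # 2024-05-10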
Example #3
class LightgbmOperator(object):
    def __init__(self, bst_path, model_tag):
        """
        初始化
        Args:
            bst_path: 通过model.save()保存的地址
        """
        self.model = Booster(model_file=bst_path)
        self.model_tag = model_tag

    def predict(self, input_datas):
        # Input validation intentionally disabled; note an isinstance check
        # would need np.ndarray, not np.array:
        # if not isinstance(input_datas, (list, np.ndarray)): ...
        return self.model.predict(input_datas)
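A hedged usage sketch for the class above; the model path and tag are placeholders:

# Hypothetical usage: "model.bst" stands in for a file produced by the
# training pipeline, and X_test for features in training-column order.
operator = LightgbmOperator(bst_path="model.bst", model_tag="v1")
preds = operator.predict(X_test)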
Example #4
    def predict_single_fold(self, model: lgb.Booster,
                            dataset: TabularDataset) -> np.ndarray:
        """Predict target values for dataset.

        Args:
            model: LightGBM Booster object.
            dataset: Test dataset.

        Returns:
            Predicted target values.

        """
        # bw_func maps raw model output back to the target scale.
        pred = self.task.losses['lgb'].bw_func(model.predict(dataset.data))

        return pred
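bw_func here appears to be the task loss's backward transform, mapping raw model output back to the target scale. A minimal sketch of that idea, assuming a hypothetical log1p-transformed regression target (not LightAutoML's actual implementation):

import numpy as np

# If the target was trained as log1p(y), the backward transform undoes it.
def bw_func(raw_pred: np.ndarray) -> np.ndarray:
    return np.expm1(raw_pred)

print(bw_func(np.array([0.0, 1.0, 2.0])))  # back on the original target scale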
Example #5
def predict(
    cv_num: int, sp: Split, model: lgb.Booster, model_number: Optional[int] = None
) -> pd.DataFrame:
    config = Config()
    d_start: int = config.CV_START_DAYS[cv_num]
    d_end: int = config.CV_START_DAYS[cv_num] + 28
    test_pred = sp.test.copy()
    test_pred[config.TARGET + "_true"] = test_pred[config.TARGET]

    # Hide the target in the forecast window, then fill it in day by day so
    # rolling features for day d can use predictions for earlier days.
    test_pred.loc[test_pred.d >= d_start, config.TARGET] = np.nan
    for d in tqdm(range(d_start, d_end)):
        test_pred = make_rolling_for_test(test_pred, d, config.features)
        test_pred.loc[test_pred.d == d, config.TARGET] = model.predict(
            test_pred.loc[test_pred.d == d, config.features]
        )
        test_pred.loc[test_pred.d == d, "sales_is_zero"] = (
            test_pred.loc[test_pred.d == d, "sales"] == 0
        ).astype(np.int8)

    return test_pred
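The recursive pattern above depends on make_rolling_for_test, which is not shown. A minimal sketch of what such a rolling-feature update could look like, with a hypothetical item_id column and a 7-day rolling-mean feature:

import pandas as pd

def make_rolling_for_test(df: pd.DataFrame, d: int, features: list) -> pd.DataFrame:
    # Recompute a lagged 7-day rolling mean of sales per item, so rows for
    # day d see the model's own predictions for days d-7..d-1.
    if "rolling_mean_7" in features:
        df["rolling_mean_7"] = (
            df.groupby("item_id")["sales"]
            .transform(lambda s: s.shift(1).rolling(7).mean())
        )
    return df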
Example #6
def mean_match_function_kdtree_cat(
    mmc,
    model: Booster,
    bachelor_features,
    candidate_values,
    random_state,
    hashed_seeds,
    candidate_preds=None,
):
    """
    This mean matching function selects categorical features by performing nearest
    neighbors on the output class probabilities. This tends to be more accurate, but
    takes more time, especially for variables with large number of classes.

    This function is slower for categorical datatypes, but results in better imputations.

        .. code-block:: text

            Mean match procedure for different datatypes:
                Categorical:
                    If mmc = 0, the class with the highest probability is chosen.
                    If mmc > 0, get N nearest neighbors from class probabilities.
                        Select 1 at random.
                Numeric:
                    If mmc = 0, the predicted value is used
                    If mmc > 0, obtain the mmc closest candidate
                        predictions and collect the associated
                        real candidate values. Choose 1 randomly.

    Parameters
    ----------
    mmc: int
        The number of mean matching candidates (derived from mean_match_candidates parameter)
    model: lgb.Booster
        The model that was trained.
    candidate_preds: pd.DataFrame or np.ndarray
        The model's predictions for the candidate rows.
        If mmc == 0, this will be None.
    bachelor_features: pd.DataFrame or np.ndarray
        The features of the rows whose response values are missing and need
        to be imputed.
    candidate_values: pd.Series or np.ndarray
        The real (not predicted) values of the candidates from the original dataset.
        Will be 1D.
        If the feature is pandas categorical, this will be the category codes.
    random_state: np.random.RandomState
        The random state from the process calling this function is passed.
    hashed_seeds: None, np.ndarray (int32)
        Used to make imputations deterministic at the record level. If this array
        is passed, random_state is ignored in favor of these seeds. These seeds are
        derived as a hash of the random_seed_array passed to the imputation functions.
        The distribution of these seeds is uniform enough.

    Returns
    -------
    The imputation values.
    Must be an np.ndarray of shape (n,), where n is the number of rows of bachelor_features.
    If the feature is categorical, return its category code (the integer corresponding to its category).

    """

    objective = model.params["objective"]
    assert objective in _REGRESSIVE_OBJECTIVES + _CATEGORICAL_OBJECTIVES, (
        "lightgbm objective not recognized - please check for aliases or " +
        "define a custom mean matching function to handle this objective.")

    # Need these no matter what.
    bachelor_preds = model.predict(bachelor_features)

    if mmc == 0:

        if objective in _REGRESSIVE_OBJECTIVES:

            imp_values = bachelor_preds

        elif objective == "binary":

            imp_values = np.floor(bachelor_preds + 0.5)

        elif objective in ["multiclass", "multiclassova"]:

            imp_values = np.argmax(bachelor_preds, axis=1)

    else:

        if objective in _REGRESSIVE_OBJECTIVES:

            imp_values = _mean_match_reg(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

        elif objective == "binary":

            bachelor_preds = logodds(bachelor_preds)

            imp_values = _mean_match_reg(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

        elif objective in ["multiclass", "multiclassova"]:

            # inner_predict returns a flat array, need to reshape for KDTree
            bachelor_preds = logodds(bachelor_preds)

            imp_values = _mean_match_multiclass_accurate(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

    return imp_values
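For readers unfamiliar with the helpers above, here is a minimal sketch of what a regression-style mean match such as _mean_match_reg could do, using scipy's cKDTree. Names and signature are illustrative, not miceforest's actual implementation:

import numpy as np
from scipy.spatial import cKDTree

def mean_match_sketch(mmc, bachelor_preds, candidate_preds, candidate_values, random_state):
    # Find the mmc candidates whose predictions are closest to each bachelor's
    # prediction, then return one of their *real* values, chosen at random.
    tree = cKDTree(np.asarray(candidate_preds).reshape(-1, 1))
    _, idx = tree.query(np.asarray(bachelor_preds).reshape(-1, 1), k=mmc)
    idx = idx.reshape(-1, mmc)
    choice = random_state.randint(0, mmc, size=idx.shape[0])
    picked = idx[np.arange(idx.shape[0]), choice]
    return np.asarray(candidate_values)[picked]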
Example #7
def predict(
    m_xgb: xgboost.XGBClassifier,
    m_lgbm: lightgbm.Booster,
    test: pd.DataFrame,
    test_previous: pd.DataFrame,
    user_summary: "UserSummary",
    question_features: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Predict the probability that the user will answer the current question correctly.

    Parameters
    ----------
    m_xgb: An xgboost classifier (unused here; kept for the commented-out alternative below).
    m_lgbm: The LightGBM Booster used to generate the predictions.
    test: The test data for which to generate predictions.
    test_previous: The previous group of test data observations, used to update
        user summary statistics.
    user_summary: A UserSummary object containing user features, that can be updated
        with incoming data.
    question_features: Question features to join on content_id.

    Returns
    -------
    A tuple of (prediction dataframe, timer dataframe). The timer dataframe is produced
    to help identify bottlenecks in the prediction pipeline that may cause a timeout
    on Kaggle.
    """
    timer = {}
    if test_previous is not None:
        tic = datetime.utcnow()
        newdata = process_test_observations(test, test_previous,
                                            question_features)
        toc = datetime.utcnow()
        timer["process_test_observations"] = (toc - tic).total_seconds()

        tic = datetime.utcnow()
        user_summary.update(newdata)
        toc = datetime.utcnow()
        timer["update_user_summary"] = (toc - tic).total_seconds()

    test = test.loc[test["content_type_id"] == 0].drop(
        columns="content_type_id")
    tic = datetime.utcnow()
    test = pd.merge(
        test,
        question_features,
        how="left",
        left_on="content_id",
        right_index=True,
        copy=False,
    )
    toc = datetime.utcnow()
    timer["merge_question_features"] = (toc - tic).total_seconds()

    tic = datetime.utcnow()
    required_columns = [
        k for k in constants.USER_SUMMARY_SCHEMA.keys() if k != "user_id"
    ]
    for col in required_columns:
        test[col] = [
            user_summary.get_feature(user_id, col)
            for user_id in test["user_id"]
        ]
    calculate_user_features(test, inplace=True)
    toc = datetime.utcnow()
    timer["merge_user_features"] = (toc - tic).total_seconds()

    tic = datetime.utcnow()
    # test["answered_correctly"] = m_xgb.predict_proba(test[constants.TRAIN_COLS])[:, 1]
    test["answered_correctly"] = m_lgbm.predict(test[constants.TRAIN_COLS])
    toc = datetime.utcnow()
    timer["prediction"] = (toc - tic).total_seconds()

    return test, pd.DataFrame(timer, index=[0])
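The repeated tic/toc bookkeeping in the function above could be factored into a small context manager; a sketch, not part of the original code:

from contextlib import contextmanager
from datetime import datetime

@contextmanager
def timed(timer: dict, name: str):
    # Record wall-clock seconds for the enclosed block under `name`.
    tic = datetime.utcnow()
    try:
        yield
    finally:
        timer[name] = (datetime.utcnow() - tic).total_seconds()

# Usage inside predict() would then read:
# with timed(timer, "merge_question_features"):
#     test = pd.merge(test, question_features, ...)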
Example #8
    def model_evaluate(self,
                       dt: pd.DataFrame,
                       prob: float = 0.5,
                       model: lgb.Booster = None):
        """
        Evaluate model on given data frame.

        Produce probability plots, AUC, average PR, F1, Precision, Recall and confusion matrix.

        Args:
            dt: data frame with labels and scores to evaluate
            prob: threshold to count probabilities as ones
            model: model to evaluate
        """
        if model is None:
            model = self.lgb_model

        # NOTE: this aliases the input frame, so the caller's `dt` gains a
        # "preds" column as a side effect.
        dt_eval = dt
        dt_eval["preds"] = model.predict(dt_eval[model.feature_name()])

        # seaborn.distplot is deprecated; histplot/displot are the modern
        # equivalents (see the sketch after this example).
        sns.distplot(dt_eval["preds"], axlabel='Full distribution')
        plt.show()
        sns.distplot(dt_eval.loc[dt_eval['label'] == 1, "preds"],
                     axlabel='Ones distribution')
        plt.show()
        sns.distplot(dt_eval.loc[dt_eval['label'] == 0, "preds"],
                     axlabel='Zeros distribution')
        plt.show()
        sns.distplot(dt_eval.loc[dt_eval['label'] == 1, "preds"],
                     axlabel='Ones distribution',
                     kde=False)
        sns.distplot(dt_eval.loc[dt_eval['label'] == 0, "preds"],
                     axlabel='Zeros distribution',
                     kde=False)
        plt.show()

        preds = [0 if x < prob else 1 for x in dt_eval["preds"]]
        cm = confusion_matrix(dt_eval['label'].values, preds)
        df_cm = pd.DataFrame(cm)
        sns.heatmap(df_cm, annot=True)
        plt.show()

        a_score = accuracy_score(dt_eval['label'].values,
                                 preds,
                                 normalize=True)
        print("Accuracy score: {}\n".format(a_score))

        class_report = classification_report(dt_eval['label'].values,
                                             preds,
                                             target_names=["Zeros", "Ones"])
        print(class_report)

        total = sum(dt_eval['label'].values)
        predicted = sum(preds)
        print("Total positive labels: {}. Positive labels predicted: {}\n".
              format(total, predicted))

        average_precision = average_precision_score(dt_eval['label'],
                                                    dt_eval['preds'])
        print('Average precision-recall score: {0:0.2f}'.format(
            average_precision))

        precision, recall, _ = precision_recall_curve(dt_eval['label'],
                                                      dt_eval['preds'],
                                                      pos_label=1)

        plt.step(recall, precision, color='b', alpha=0.2, where='post')
        plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.ylim([0.0, 1.05])
        plt.xlim([0.0, 1.0])
        plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(
            average_precision))
        plt.show()
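seaborn.distplot was deprecated in seaborn 0.11 and later removed, so on current seaborn the plots above need histplot/displot instead. A minimal equivalent for the first plot:

import matplotlib.pyplot as plt
import seaborn as sns

# Replacement for sns.distplot(dt_eval["preds"], axlabel='Full distribution'):
ax = sns.histplot(dt_eval["preds"], kde=True, stat="density")
ax.set_xlabel("Full distribution")
plt.show()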
Example #9
    def predict(booster: lgb.Booster,
                dtest: pd.DataFrame,
                dist,
                pred_type: str,
                n_samples: int = 1000,
                quantiles: list = [0.1, 0.5, 0.9],
                seed: int = 123):
        '''A customized LightGBMLSS prediction function.

        booster: lgb.Booster
            Trained LightGBMLSS model.
        dtest: pd.DataFrame
            Test data.
        dist:
            The distributional assumption (a LightGBMLSS distribution object).
        pred_type: str
            Specifies what is to be predicted:
                "response" draws n_samples from the predicted response distribution.
                "quantiles" calculates the quantiles from the predicted response distribution.
                "parameters" returns the predicted distributional parameters.
                "expectiles" returns the predicted expectiles.
        n_samples: int
            If pred_type="response", how many samples are drawn from the predicted response distribution.
        quantiles: list
            If pred_type="quantiles", the quantiles to calculate from the predicted response distribution.
        seed: int
            If pred_type="response", the seed for drawing samples from the predicted response distribution.

        '''

        dict_param = dist.param_dict()
        predt = booster.predict(dtest, raw_score=True)

        # Set init_score as starting point for each distributional parameter.
        init_score_pred = (np.ones(shape=(dtest.shape[0],
                                          1))) * dist.start_values

        dist_params_predts = []

        # The prediction result doesn't include the init_score specified in creating the train data.
        # Hence, it needs to be added manually with the corresponding transform for each distributional parameter.
        for i, (dist_param, response_fun) in enumerate(dict_param.items()):
            dist_params_predts.append(
                response_fun(predt[:, i] + init_score_pred[:, i]))

        dist_params_df = pd.DataFrame(dist_params_predts).T
        dist_params_df.columns = dict_param.keys()

        if pred_type == "parameters":
            return dist_params_df

        elif pred_type == "expectiles":
            return dist_params_df

        elif pred_type == "response":
            pred_resp_df = dist.pred_dist_rvs(pred_params=dist_params_df,
                                              n_samples=n_samples,
                                              seed=seed)

            pred_resp_df.columns = [
                "y_pred_sample_" + str(i)
                for i in range(pred_resp_df.shape[1])
            ]
            return pred_resp_df

        elif pred_type == "quantiles":
            pred_quant_df = dist.pred_dist_quantile(quantiles=quantiles,
                                                    pred_params=dist_params_df)

            pred_quant_df.columns = [
                "quant_" + str(q) for q in quantiles
            ]
            return pred_quant_df
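A hedged usage sketch for the function above; the booster and dist objects are assumed to come from a LightGBMLSS training run, which is not shown here:

# Hypothetical: `booster` and `dist` come from a prior LightGBMLSS fit,
# and X_test is a feature frame matching the training columns.
quant_df = predict(booster, dtest=X_test, dist=dist,
                   pred_type="quantiles", quantiles=[0.05, 0.5, 0.95])
print(quant_df.head())  # columns: quant_0.05, quant_0.5, quant_0.95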