def _load(properties):
    """Load a LGBMExplainableModel from the given properties.

    :param properties: A serialized dictionary representation of the LGBMExplainableModel.
    :type properties: dict
    :return: The deserialized LGBMExplainableModel.
    :rtype: azureml.explain.model.mimic.models.LGBMExplainableModel
    """
    # Create the LGBMExplainableModel without any properties using the __new__ function, similar to pickle
    lightgbm = LGBMExplainableModel.__new__(LGBMExplainableModel)
    # Get _n_features
    _n_features = properties.pop(_N_FEATURES)
    # In the classification case get _n_classes
    if json.loads(properties[LightGBMSerializationConstants.MULTICLASS]):
        _n_classes = properties.pop(_N_CLASSES)
    # Load all of the properties
    for key, value in properties.items():
        # Regenerate the properties on the fly
        if key in LightGBMSerializationConstants.nonify_properties:
            if key == LightGBMSerializationConstants.LOGGER:
                parent = logging.getLogger(__name__)
                lightgbm_identity = json.loads(properties[LightGBMSerializationConstants.IDENTITY])
                lightgbm.__dict__[key] = parent.getChild(lightgbm_identity)
            elif key == LightGBMSerializationConstants.TREE_EXPLAINER:
                lightgbm.__dict__[key] = None
            else:
                raise Exception("Unknown nonify key on deserialize in LightGBMExplainableModel: {}".format(key))
        elif key in LightGBMSerializationConstants.save_properties:
            # Load the booster from the model string and re-create the LGBMClassifier or LGBMRegressor.
            # This is not recommended but can be necessary to get around pickle not being secure.
            # See here for more info:
            # https://github.com/Microsoft/LightGBM/issues/1942
            # https://github.com/Microsoft/LightGBM/issues/1217
            if json.loads(properties[LightGBMSerializationConstants.MULTICLASS]):
                new_lgbm = LGBMClassifier()
                lgbm_booster = Booster(params={LightGBMSerializationConstants.MODEL_STR: value})
                new_lgbm._Booster = lgbm_booster
                new_lgbm._n_classes = _n_classes
            else:
                new_lgbm = LGBMRegressor()
                lgbm_booster = Booster(params={LightGBMSerializationConstants.MODEL_STR: value})
                new_lgbm._Booster = lgbm_booster
            new_lgbm._n_features = _n_features
            lightgbm.__dict__[key] = new_lgbm
        elif key in LightGBMSerializationConstants.enum_properties:
            # NOTE: If more enums are added in future, this will need to be handled differently
            lightgbm.__dict__[key] = ShapValuesOutput(json.loads(value))
        else:
            lightgbm.__dict__[key] = json.loads(value)
    return lightgbm
def test_quant_data_wrapper(input_file_numbers, model, normalize=True, predict_iteration=None):
    """
    input: (file_numbers, model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
        model = Booster(model_file='tmp_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            with open(fin_path, 'rb') as fin_data_file:
                stock_ids, stock_scores, vec_values = cPickle.load(fin_data_file)
            data_process_logger.info('testing file: %s' % fin_path)
            input_datas = np.column_stack((stock_ids, stock_scores, vec_values))
            mean_rank_rate = test_datas(input_datas, model)
            if mean_rank_rate >= 0.4:
                data_analysis_logger.info('the file number is %s, obs = %s' % (i, len(input_datas)))
            mean_rank_rates.append(mean_rank_rate)
            file_number_list.append(i)
        except Exception as e:
            data_process_logger.info('test file failed: file path=%s, details=%s' % (fin_path, e))
def multi_classifier(self):
    """An instance of the pre-trained multi-temporal cloud classifier. Loaded only the first time it is required."""
    if self._multi_classifier is None:
        path = os.path.join(self.MODELS_FOLDER, self.MULTI_CLASSIFIER_NAME)
        self._multi_classifier = Booster(model_file=path)
    return self._multi_classifier
def load_lgbm_model(fname):
    """
    Load a LightGBM model that was saved as a file with the
    HyperLGBMClassifier.save method. The model spans two files:

    * The first file contains the model saved with the Booster class; this file has no extension.
    * The second file contains the parameters used to create the model; this file has the extension '.p'.

    Parameters
    ----------
    fname : path
        The file name without extension.
    """
    from lightgbm import Booster
    params = pickle.load(open(fname + '.p', "rb"))
    n_features = params['meta']['n_features']
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperLGBMClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    y = [i for i in range(n_classes)]
    model.set_le(y)
    model.set_n_features_(n_features)
    model._Booster = Booster(model_file=fname)
    return model
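# Usage sketch for load_lgbm_model (an illustration, not from the original
# source). Assumes HyperLGBMClassifier.save previously wrote the two files
# described above under a hypothetical base name 'hyper_lgbm_model'
# (the extensionless booster file plus 'hyper_lgbm_model.p'):
#
#   model = load_lgbm_model('hyper_lgbm_model')
#   preds = model.predict(X_new)  # X_new: features shaped like the training data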
def compute_importances(importances: pd.DataFrame, columns: List[str],
                        model: lgb.Booster, fold: int) -> pd.DataFrame:
    imp_df = pd.DataFrame()
    imp_df['feature'] = columns
    imp_df['gain'] = model.feature_importance('gain')
    imp_df['fold'] = fold + 1
    importances = pd.concat([importances, imp_df], axis=0, sort=False)
    return importances
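# A runnable sketch of compute_importances (an illustration, not from the
# original source): accumulate per-fold gain importances from toy boosters
# trained on random data, then average gain per feature across folds.
import numpy as np
import pandas as pd
import lightgbm as lgb

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 4)), columns=[f"f{i}" for i in range(4)])
y = rng.integers(0, 2, size=200)

importances = pd.DataFrame()
for fold in range(3):
    booster = lgb.train({"objective": "binary", "verbose": -1},
                        lgb.Dataset(X, label=y), num_boost_round=10)
    importances = compute_importances(importances, list(X.columns), booster, fold)

# Average gain per feature over the three folds
mean_gain = importances.groupby("feature")["gain"].mean().sort_values(ascending=False)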
def classifier(self):
    """
    Provides a classifier object. It also loads it if it hasn't been loaded yet.
    This way the classifier is loaded only when it is actually required.
    """
    if self._classifier is None:
        self._classifier = PixelClassifier(Booster(model_file=self.model_filename))
    return self._classifier
def _get_importance(model: lgb.Booster, features: List[str]) -> pd.DataFrame:
    df = pd.DataFrame()
    df["feature"] = features
    df["importance"] = model.feature_importance(
        importance_type="gain", iteration=model.best_iteration
    )
    return df
def predict(X_test: pd.DataFrame, y_test, gbm: lgb.Booster):
    # predict
    pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
    y_pred = []
    for x in pred:
        y_pred.append(np.argmax(x))
    # Print the precision and recall, among other metrics
    print(metrics.classification_report(y_test, y_pred, target_names=Categories))
def predict(gbm: lgb.Booster, test_data: pd.DataFrame, full_data: pd.DataFrame,
            feature_names: List[str]):
    # Find the most recent Friday and select that day's rows as the live data
    last_friday = datetime.now() + relativedelta(weekday=FR(-1))
    date_string = last_friday.strftime('%Y-%m-%d')
    print(date_string)
    live_data = full_data.loc[date_string].copy()
    live_data.dropna(subset=feature_names, inplace=True)
    live_data[PREDICTION_NAME] = gbm.predict(live_data[feature_names])
    test_data[PREDICTION_NAME] = gbm.predict(test_data[feature_names])
    return dict(
        predicted_live_data=live_data,
        predicted_test_data=test_data
    )
def from_model(
    cls,
    booster: lightgbm.Booster,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "LightGBMCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores a LightGBM model.

    Args:
        booster: The LightGBM model to store in the checkpoint.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        An :py:class:`LightGBMCheckpoint` containing the specified ``booster``.

    Examples:
        >>> from ray.train.lightgbm import LightGBMCheckpoint
        >>> import lightgbm
        >>>
        >>> booster = lightgbm.Booster()  # doctest: +SKIP
        >>> checkpoint = LightGBMCheckpoint.from_model(booster, path=".")  # doctest: +SKIP

        You can use a :py:class:`LightGBMCheckpoint` to create an
        :py:class:`~ray.train.lightgbm.LightGBMPredictor` and perform inference.

        >>> from ray.train.lightgbm import LightGBMPredictor
        >>>
        >>> predictor = LightGBMPredictor.from_checkpoint(checkpoint)  # doctest: +SKIP
    """
    booster.save_model(os.path.join(path, MODEL_KEY))
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)
    checkpoint = cls.from_directory(path)
    return checkpoint
class LightgbmOperator(object):
    def __init__(self, bst_path, model_tag):
        """
        Initialize the operator.

        Args:
            bst_path: path where the model was saved via model.save()
        """
        self.model = Booster(model_file=bst_path)
        self.model_tag = model_tag

    def predict(self, input_datas):
        # if not isinstance(input_datas, list) and not isinstance(input_datas, np.array):
        return self.model.predict(input_datas)
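# Usage sketch for LightgbmOperator (an illustration, not from the original
# source). The file path and tag are hypothetical; the path should point at a
# booster saved with model.save_model(...):
#
#   operator = LightgbmOperator('lgbm_model.txt', model_tag='v1')
#   scores = operator.predict(feature_rows)  # feature_rows: 2-D array-like of model inputs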
def to_air_checkpoint(
    path: str,
    booster: lightgbm.Booster,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serving or inference.

    Args:
        path: The directory path where model and preprocessor steps are stored to.
        booster: A pretrained lightgbm model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.

    Returns:
        A Ray AIR checkpoint.
    """
    booster.save_model(os.path.join(path, MODEL_KEY))
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)
    checkpoint = Checkpoint.from_directory(path)
    return checkpoint
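# Usage sketch (an illustration, not from the original source), assuming a
# trained booster and the Ray AIR predictor API shown elsewhere in this
# collection; the checkpoint directory "./ckpt" is hypothetical:
#
#   checkpoint = to_air_checkpoint(path="./ckpt", booster=trained_booster)
#   predictor = LightGBMPredictor.from_checkpoint(checkpoint)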
def predict_single_fold(self, model: lgb.Booster, dataset: TabularDataset) -> np.ndarray:
    """Predict target values for dataset.

    Args:
        model: Lightgbm object.
        dataset: test dataset.

    Return:
        predicted target values.
    """
    pred = self.task.losses['lgb'].bw_func(model.predict(dataset.data))
    return pred
def __init__(self, model_config_dict: dict, threads: int = 1):
    """Initialise the tree model variables used in the application of
    RainForests Calibration. LightGBM Boosters are used for tree model predictors.

    Args:
        model_config_dict:
            Dictionary containing Rainforests model configuration variables.
        threads:
            Number of threads to use during prediction with tree-model objects.

    Dictionary is of format::

        {
            "-50.0" : {
                "lightgbm_model" : "<path_to_lightgbm_model_object>"
            },
            "-25.0" : {
                "lightgbm_model" : "<path_to_lightgbm_model_object>"
            },
            ...,
            "50.0" : {
                "lightgbm_model" : "<path_to_lightgbm_model_object>"
            }
        }

    The keys specify the error threshold value, while the associated values
    are the path to the corresponding tree-model objects for that threshold.
    """
    from lightgbm import Booster

    # Dictionary keys represent error thresholds, however they may be strings as
    # they are sourced from json files. In order to use these in processing, and
    # to sort them in a sensible fashion, we shall cast the key values as float32.
    sorted_model_config_dict = OrderedDict(
        sorted({np.float32(k): v for k, v in model_config_dict.items()}.items())
    )

    self.error_thresholds = np.array([*sorted_model_config_dict.keys()])

    lightgbm_model_filenames = [
        Path(threshold_dict.get("lightgbm_model")).expanduser()
        for threshold_dict in sorted_model_config_dict.values()
    ]
    self.tree_models = [
        Booster(model_file=str(file)).reset_parameter({"num_threads": threads})
        for file in lightgbm_model_filenames
    ]
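# A minimal configuration sketch matching the model_config_dict format
# documented above (an illustration, not from the original source; the model
# paths and the calibration class name are hypothetical):
#
#   model_config = {
#       "-50.0": {"lightgbm_model": "~/models/threshold_-50.txt"},
#       "0.0": {"lightgbm_model": "~/models/threshold_0.txt"},
#       "50.0": {"lightgbm_model": "~/models/threshold_50.txt"},
#   }
#   calibration = RainForestsCalibration(model_config, threads=4)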
def get_surrogate_booster_pyspark(filtered_df, analyzer, max_depth, num_leaves,
                                  min_child_samples):
    """Get surrogate booster for pyspark dataframe.

    Creates the surrogate model trained on errors and returns the booster.

    :param filtered_df: The filtered dataframe.
    :type filtered_df: pyspark.sql.DataFrame
    :param analyzer: The error analyzer containing the categorical
        features and categories for the full dataset.
    :type analyzer: BaseAnalyzer
    :param max_depth: The maximum depth of the surrogate tree trained
        on errors.
    :type max_depth: int
    :param num_leaves: The number of leaves of the surrogate tree
        trained on errors.
    :type num_leaves: int
    :param min_child_samples: The minimum number of samples required
        to create one leaf.
    :type min_child_samples: int
    :return: The extracted booster from the surrogate model and the
        scored dataset.
    :rtype: (Booster, pyspark.sql.DataFrame)
    """
    # Compute the pred_y column
    scored_data = analyzer.model.transform(filtered_df.to_spark())
    diff_data = scored_data.withColumn(
        DIFF, F.when(F.col(analyzer.true_y) != F.col(PREDICTION), 1).otherwise(0))
    if analyzer.model_task == ModelTask.CLASSIFICATION:
        diff_data = diff_data.drop(PREDICTION, RAW_PREDICTION, PROBABILITY)
    else:
        diff_data = diff_data.drop(PREDICTION)
    model = create_surrogate_model_pyspark(analyzer, diff_data, max_depth,
                                           num_leaves, min_child_samples)
    # TODO: update lightgbm in pyspark to get around file requirement
    model_path = "./models/lgbmclassifier.model"
    model.saveNativeModel(model_path)
    model_file = glob.glob(model_path + '/*.txt')[0]
    with open(model_file) as f:
        contents = f.read()
    booster_args = {'objective': analyzer.model_task}
    lgbm_booster = Booster(params=booster_args, model_str=contents)
    return lgbm_booster, diff_data.to_koalas()
def predict(
    cv_num: int, sp: Split, model: lgb.Booster, model_number: Optional[int] = None
) -> pd.DataFrame:
    config = Config()
    d_start: int = config.CV_START_DAYS[cv_num]
    d_end: int = config.CV_START_DAYS[cv_num] + 28
    test_pred = sp.test.copy()
    test_pred[config.TARGET + "_true"] = test_pred[config.TARGET]
    test_pred.loc[test_pred.d >= d_start, config.TARGET] = np.nan
    for d in tqdm(range(d_start, d_end)):
        test_pred = make_rolling_for_test(test_pred, d, config.features)
        test_pred.loc[test_pred.d == d, config.TARGET] = model.predict(
            test_pred.loc[test_pred.d == d, config.features]
        )
        test_pred.loc[test_pred.d == d, "sales_is_zero"] = (
            test_pred.loc[test_pred.d == d, "sales"] == 0
        ).astype(np.int8)
    return test_pred
def parallel_test_quant_data_wrapper(input_file_numbers, model, normalize=True,
                                     predict_iteration=None, process_count=2):
    """
    input: (file_numbers, model)
    output: mean rank rate
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_model.txt', num_iteration=predict_iteration)
    else:
        model.save_model('tmp_model.txt')
    global g_model
    g_model = Booster(model_file='tmp_model.txt')
    proc_pool = multiprocessing.Pool(process_count)
    multi_result = []
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant-Datas-2.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        data_res = proc_pool.apply_async(test_single_file, args=(fin_path,))
        multi_result.append(data_res)
    proc_pool.close()
    proc_pool.join()
    # Merge the results
    for i in range(len(multi_result)):
        tmp_mean_rank_rate, file_n = multi_result[i].get()
        mean_rank_rates.append(tmp_mean_rank_rate)
        file_number_list.append(file_n)
    mean_rank_rate = np.mean(mean_rank_rates)
    std_rank_rate = np.std(mean_rank_rates)
    var_rank = np.var(mean_rank_rates)
    data_process_logger.info(
        'Tested %s files, all input files mean rank rate is %s, all input files std is %s, var is %s' % (
            len(input_file_numbers), mean_rank_rate, std_rank_rate, var_rank))
    return file_number_list, mean_rank_rates
def pipeline_test_lambdarank_wrapper(input_file_numbers, model, normalize=True,
                                     predict_iteration=None):
    """
    Run result testing.

    Args:
        input_file_numbers:
        model:
        normalize:
        predict_iteration:

    Returns:
    """
    mean_rank_rates = []
    file_number_list = []
    if predict_iteration:
        model.save_model('tmp_lambdarank_model.txt', num_iteration=predict_iteration)
        model = Booster(model_file='tmp_lambdarank_model.txt')
    for i in input_file_numbers:
        data_root_path = '%s/datas/Quant_Datas_v3.0' % (DATA_ROOT)
        if normalize:
            fin_path = '%s/pickle_datas/%s_trans_norm.pickle' % (data_root_path, i)
        else:
            fin_path = '%s/pickle_datas/%s_trans.pickle' % (data_root_path, i)
        try:
            mean_rank_rate = test_single_lambdarank_file(fin_path, model)
            if mean_rank_rate:
                mean_rank_rates.append(mean_rank_rate)
                file_number_list.append(i)
        except Exception as e:
            data_process_logger.info(
                'test file failed: file path=%s, details=%s' % (fin_path, e))
def save_lgb(model: lgb.Booster, path):
    # Serialize the booster to its text representation and write it to disk
    model_str = model.model_to_string()
    with open(path, 'w') as f:
        f.write(model_str)
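# Round-trip sketch for save_lgb (an illustration, not from the original
# source): train a toy booster on random data, save it with save_lgb, and
# reload it with lgb.Booster(model_file=...). model_to_string() writes the
# same text format that save_model() produces, so the reload is lossless.
import numpy as np
import lightgbm as lgb

rng = np.random.default_rng(42)
X = rng.normal(size=(100, 3))
y = rng.normal(size=100)

booster = lgb.train({"objective": "regression", "verbose": -1},
                    lgb.Dataset(X, label=y), num_boost_round=5)
save_lgb(booster, "toy_model.txt")
restored = lgb.Booster(model_file="toy_model.txt")
assert np.allclose(booster.predict(X), restored.predict(X))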
def predict(
    m_xgb: xgboost.XGBClassifier,
    m_lgbm: lightgbm.Booster,
    test: pd.DataFrame,
    test_previous: pd.DataFrame,
    user_summary: "UserSummary",
    question_features: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Predict the probability that the user will answer the current question correctly.

    Parameters
    ----------
    m_xgb:
        An xgboost classifier model object (currently unused; see the
        commented-out prediction line below).
    m_lgbm:
        A lightgbm Booster used to generate the predictions.
    test:
        The test data for which to generate predictions.
    test_previous:
        The previous group of test data observations, used to update user
        summary statistics.
    user_summary:
        A UserSummary object containing user features, that can be updated
        with incoming data.
    question_features:
        Question features to join on content_id.

    Returns
    -------
    A tuple of (prediction dataframe, timer dataframe). The timer dataframe
    is produced to help identify bottlenecks in the prediction pipeline that
    may cause a timeout on Kaggle.
    """
    timer = {}
    if test_previous is not None:
        tic = datetime.utcnow()
        newdata = process_test_observations(test, test_previous, question_features)
        toc = datetime.utcnow()
        timer["process_test_observations"] = (toc - tic).total_seconds()
        tic = datetime.utcnow()
        user_summary.update(newdata)
        toc = datetime.utcnow()
        timer["update_user_summary"] = (toc - tic).total_seconds()
    test = test.loc[test["content_type_id"] == 0].drop(columns="content_type_id")
    tic = datetime.utcnow()
    test = pd.merge(
        test,
        question_features,
        how="left",
        left_on="content_id",
        right_index=True,
        copy=False,
    )
    toc = datetime.utcnow()
    timer["merge_question_features"] = (toc - tic).total_seconds()
    tic = datetime.utcnow()
    required_columns = [
        k for k in constants.USER_SUMMARY_SCHEMA.keys() if k != "user_id"
    ]
    for col in required_columns:
        test[col] = [
            user_summary.get_feature(user_id, col) for user_id in test["user_id"]
        ]
    calculate_user_features(test, inplace=True)
    toc = datetime.utcnow()
    timer["merge_user_features"] = (toc - tic).total_seconds()
    tic = datetime.utcnow()
    # test["answered_correctly"] = m_xgb.predict_proba(test[constants.TRAIN_COLS])[:, 1]
    test["answered_correctly"] = m_lgbm.predict(test[constants.TRAIN_COLS])
    toc = datetime.utcnow()
    timer["prediction"] = (toc - tic).total_seconds()
    return test, pd.DataFrame(timer, index=[0])
def _load(properties):
    """Load a LGBMExplainableModel from the given properties.

    :param properties: A serialized dictionary representation of the LGBMExplainableModel.
    :type properties: dict
    :return: The deserialized LGBMExplainableModel.
    :rtype: interpret_community.mimic.models.LGBMExplainableModel
    """
    # Create the LGBMExplainableModel without any properties using the __new__ function, similar to pickle
    lgbm_model = LGBMExplainableModel.__new__(LGBMExplainableModel)
    # Get _n_features
    _n_features = properties.pop(_N_FEATURES)
    # In the classification case get _n_classes
    if json.loads(properties[LightGBMSerializationConstants.MULTICLASS]):
        _n_classes = properties.pop(_N_CLASSES)
    fitted_ = None
    if _FITTED in properties:
        fitted_ = json.loads(properties[_FITTED])
    elif version.parse('3.3.1') <= version.parse(lightgbm.__version__):
        # If deserializing an older model in a newer version, set this to True to prevent errors on calls
        fitted_ = True
    # Load all of the properties
    for key, value in properties.items():
        # Regenerate the properties on the fly
        if key in LightGBMSerializationConstants.nonify_properties:
            if key == LightGBMSerializationConstants.LOGGER:
                parent = logging.getLogger(__name__)
                lightgbm_identity = json.loads(
                    properties[LightGBMSerializationConstants.IDENTITY])
                lgbm_model.__dict__[key] = parent.getChild(lightgbm_identity)
            elif key == LightGBMSerializationConstants.TREE_EXPLAINER:
                lgbm_model.__dict__[key] = None
            else:
                raise Exception(
                    "Unknown nonify key on deserialize in LightGBMExplainableModel: {}".format(key))
        elif key in LightGBMSerializationConstants.save_properties:
            # Load the booster from the model string and re-create the LGBMClassifier or LGBMRegressor.
            # This is not recommended but can be necessary to get around pickle not being secure.
            # See here for more info:
            # https://github.com/Microsoft/LightGBM/issues/1942
            # https://github.com/Microsoft/LightGBM/issues/1217
            booster_args = {LightGBMSerializationConstants.MODEL_STR: value}
            is_multiclass = json.loads(
                properties[LightGBMSerializationConstants.MULTICLASS])
            if is_multiclass:
                objective = LightGBMSerializationConstants.MULTICLASS
            else:
                objective = LightGBMSerializationConstants.REGRESSION
            if LightGBMSerializationConstants.MODEL_STR in inspect.getargspec(Booster).args:
                extras = {LightGBMSerializationConstants.OBJECTIVE: objective}
                lgbm_booster = Booster(**booster_args, params=extras)
            else:
                # For backwards compatibility with older versions of lightgbm
                booster_args[LightGBMSerializationConstants.OBJECTIVE] = objective
                lgbm_booster = Booster(params=booster_args)
            if is_multiclass:
                new_lgbm = LGBMClassifier()
                new_lgbm._Booster = lgbm_booster
                new_lgbm._n_classes = _n_classes
            else:
                new_lgbm = LGBMRegressor()
                new_lgbm._Booster = lgbm_booster
            # Specify fitted_ for newer versions of lightgbm on deserialize
            if fitted_ is not None:
                new_lgbm.fitted_ = fitted_
            new_lgbm._n_features = _n_features
            lgbm_model.__dict__[key] = new_lgbm
        elif key in LightGBMSerializationConstants.enum_properties:
            # NOTE: If more enums are added in future, this will need to be handled differently
            lgbm_model.__dict__[key] = ShapValuesOutput(json.loads(value))
        else:
            lgbm_model.__dict__[key] = json.loads(value)
    return lgbm_model
def model_evaluate(self, dt: pd.DataFrame, prob: float = 0.5,
                   model: lgb.Booster = None):
    """
    Evaluate the model on a given data frame. Produce probability plots, AUC,
    average PR, F1, Precision, Recall and a confusion matrix.

    Args:
        dt: data frame with labels and scores to evaluate
        prob: threshold above which probabilities are counted as ones
        model: model to evaluate
    """
    if not model:
        model = self.lgb_model
    dt_eval = dt
    dt_eval["preds"] = model.predict(dt_eval[model.feature_name()])
    dt_eval["preds"].head()
    sns.distplot(dt_eval["preds"], axlabel='Full distribution')
    plt.show()
    sns.distplot(dt_eval.loc[dt_eval['label'] == 1, "preds"], axlabel='Ones distribution')
    plt.show()
    sns.distplot(dt_eval.loc[dt_eval['label'] == 0, "preds"], axlabel='Zeros distribution')
    plt.show()
    sns.distplot(dt_eval.loc[dt_eval['label'] == 1, "preds"], axlabel='Ones distribution', kde=False)
    sns.distplot(dt_eval.loc[dt_eval['label'] == 0, "preds"], axlabel='Zeros distribution', kde=False)
    plt.show()
    preds = [0 if x < prob else 1 for x in dt_eval["preds"]]
    cm = confusion_matrix(dt_eval['label'].values, preds)
    df_cm = pd.DataFrame(cm)
    sns.heatmap(df_cm, annot=True)
    plt.show()
    a_score = accuracy_score(dt_eval['label'].values, preds, normalize=True)
    print("Accuracy score: {}\n".format(a_score))
    class_report = classification_report(dt_eval['label'].values, preds,
                                         target_names=["Zeros", "Ones"])
    print(class_report)
    total = sum(dt_eval['label'].values)
    predicted = sum(preds)
    print("Total positive labels: {}. Positive labels predicted: {}\n".format(total, predicted))
    average_precision = average_precision_score(dt_eval['label'], dt_eval['preds'])
    print('Average precision-recall score: {0:0.2f}'.format(average_precision))
    precision, recall, _ = precision_recall_curve(dt_eval['label'], dt_eval['preds'], pos_label=1)
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
    plt.show()
import pandas as pd
import numpy as np
import json
from lightgbm import Booster
from flask import Flask, jsonify, request, current_app

app = Flask(__name__)
app.config['MODEL'] = Booster(model_file="model.txt")


@app.route("/predict", methods=["GET", "POST"])
def predict():
    data = {"success": False}
    df = request.json
    if df is not None:
        df = pd.read_json(df)
        df["primary_use"] = pd.Categorical(df["primary_use"])
        df["meter"] = pd.Categorical(df["meter"])
        df["hour"] = pd.Categorical(df["hour"])
        df["weekday"] = pd.Categorical(df["weekday"])
        model = current_app.config['MODEL']
        data = {
            "success": True,
            "prediction": np.expm1(model.predict(df)).tolist()
        }
    return jsonify(data)
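# Client-side sketch for the /predict endpoint above (an illustration, not
# from the original source). The handler calls pd.read_json on the request
# body, so the client must send the dataframe as a JSON-serialized string;
# the host/port and features_df are hypothetical:
#
#   import requests
#   payload = features_df.to_json()  # features_df: columns expected by model.txt
#   response = requests.post("http://localhost:5000/predict", json=payload)
#   print(response.json())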
def mean_match_function_kdtree_cat(
    mmc,
    model: Booster,
    bachelor_features,
    candidate_values,
    random_state,
    hashed_seeds,
    candidate_preds=None,
):
    """
    This mean matching function selects categorical features by performing
    nearest neighbors on the output class probabilities. This tends to be more
    accurate, but takes more time, especially for variables with a large number
    of classes. This function is slower for categorical datatypes, but results
    in better imputations.

    .. code-block:: text

        Mean match procedure for different datatypes:
            Categorical:
                If mmc = 0, the class with the highest probability is chosen.
                If mmc > 0, get N nearest neighbors from class probabilities.
                    Select 1 at random.
            Numeric:
                If mmc = 0, the predicted value is used.
                If mmc > 0, obtain the mmc closest candidate predictions and
                    collect the associated real candidate values. Choose 1 randomly.

    Parameters
    ----------
    mmc: int
        The number of mean matching candidates (derived from the
        mean_match_candidates parameter).
    model: lgb.Booster
        The model that was trained.
    bachelor_features: pd.DataFrame or np.ndarray
        The features corresponding to the missing values of the response
        variable used to train the model.
    candidate_values: pd.Series or np.ndarray
        The real (not predicted) values of the candidates from the original
        dataset. Will be 1D. If the feature is pandas categorical, this will
        be the category codes.
    random_state: np.random.RandomState
        The random state from the process calling this function is passed.
    hashed_seeds: None, np.ndarray (int32)
        Used to make imputations deterministic at the record level. If this
        array is passed, random_state is ignored in favor of these seeds.
        These seeds are derived as a hash of the random_seed_array passed to
        the imputation functions. The distribution of these seeds is uniform
        enough.
    candidate_preds: pd.DataFrame or np.ndarray
        The model predictions for the candidates. If mmc == 0, this will be
        None.

    Returns
    -------
    The imputation values.
    Must be a np.ndarray of shape (n,), where n is the length of dimension 1
    of bachelor_features. If the feature is categorical, return its category
    code (the integer corresponding to its category).
    """
    objective = model.params["objective"]
    assert objective in _REGRESSIVE_OBJECTIVES + _CATEGORICAL_OBJECTIVES, (
        "lightgbm objective not recognized - please check for aliases or "
        + "define a custom mean matching function to handle this objective."
    )

    # Need these no matter what.
    bachelor_preds = model.predict(bachelor_features)

    if mmc == 0:
        if objective in _REGRESSIVE_OBJECTIVES:
            imp_values = bachelor_preds
        elif objective == "binary":
            imp_values = np.floor(bachelor_preds + 0.5)
        elif objective in ["multiclass", "multiclassova"]:
            imp_values = np.argmax(bachelor_preds, axis=1)
    else:
        if objective in _REGRESSIVE_OBJECTIVES:
            imp_values = _mean_match_reg(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )
        elif objective == "binary":
            bachelor_preds = logodds(bachelor_preds)
            imp_values = _mean_match_reg(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )
        elif objective in ["multiclass", "multiclassova"]:
            # inner_predict returns a flat array, need to reshape for KDTree
            bachelor_preds = logodds(bachelor_preds)
            imp_values = _mean_match_multiclass_accurate(
                mmc,
                bachelor_preds,
                candidate_preds,
                candidate_values,
                random_state,
                hashed_seeds,
            )

    return imp_values
def load(self):
    with open(self.path_to_data, 'rb') as f:
        self.weekday_mean_data, self.hour_average = pickle.load(f)
    self.booster = Booster(model_file=self.path_to_weights)
def booster_fixture():
    package_path = os.path.dirname(s2cloudless.__file__)
    model_path = os.path.join(package_path, 'models', MODEL_FILENAME)
    return Booster(model_file=model_path)
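# Usage sketch (an illustration, not from the original source), assuming
# booster_fixture is registered with @pytest.fixture so pytest injects it:
#
#   def test_booster_loads(booster_fixture):
#       # The bundled s2cloudless model should expose at least one input feature
#       assert booster_fixture.num_feature() > 0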
def predict(booster: lgb.Booster, dtest: pd.DataFrame, dist: str, pred_type: str,
            n_samples: int = 1000, quantiles: list = [0.1, 0.5, 0.9],
            seed: int = 123):
    '''A customized lightgbmlss prediction function.

    booster: lgb.Booster
        Trained LightGBMLSS-Model
    dtest: pd.DataFrame
        Test Data
    dist: str
        Specifies the distributional assumption.
    pred_type: str
        Specifies what is to be predicted:
            "response" draws n_samples from the predicted response distribution.
            "quantiles" calculates the quantiles from the predicted response distribution.
            "parameters" returns the predicted distributional parameters.
            "expectiles" returns the predicted expectiles.
    n_samples: int
        If pred_type="response", specifies how many samples are drawn from the
        predicted response distribution.
    quantiles: list
        If pred_type="quantiles", specifies the quantiles to calculate from the
        predicted response distribution.
    seed: int
        If pred_type="response", specifies the seed for drawing samples from
        the predicted response distribution.
    '''
    dict_param = dist.param_dict()
    predt = booster.predict(dtest, raw_score=True)

    # Set init_score as starting point for each distributional parameter.
    init_score_pred = (np.ones(shape=(dtest.shape[0], 1))) * dist.start_values

    dist_params_predts = []

    # The prediction result doesn't include the init_score specified when creating the train data.
    # Hence, it needs to be added manually with the corresponding transform for each distributional parameter.
    for i, (dist_param, response_fun) in enumerate(dict_param.items()):
        dist_params_predts.append(response_fun(predt[:, i] + init_score_pred[:, i]))

    dist_params_df = pd.DataFrame(dist_params_predts).T
    dist_params_df.columns = dict_param.keys()

    if pred_type == "parameters":
        return dist_params_df

    elif pred_type == "expectiles":
        return dist_params_df

    elif pred_type == "response":
        pred_resp_df = dist.pred_dist_rvs(pred_params=dist_params_df,
                                          n_samples=n_samples,
                                          seed=seed)
        pred_resp_df.columns = [str("y_pred_sample_") + str(i)
                                for i in range(pred_resp_df.shape[1])]
        return pred_resp_df

    elif pred_type == "quantiles":
        pred_quant_df = dist.pred_dist_quantile(quantiles=quantiles,
                                                pred_params=dist_params_df)
        pred_quant_df.columns = [str("quant_") + str(quantiles[i])
                                 for i in range(len(quantiles))]
        return pred_quant_df
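# Usage sketch for the lightgbmlss predict wrapper above (an illustration,
# not from the original source; booster and dist are assumed to come from a
# trained LightGBMLSS model, and X_test is a hypothetical feature frame):
#
#   quantile_preds = predict(booster, X_test, dist=dist, pred_type="quantiles",
#                            quantiles=[0.05, 0.5, 0.95])
#   samples = predict(booster, X_test, dist=dist, pred_type="response",
#                     n_samples=500, seed=123)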
def get_num_trees(booster: lgbm.Booster) -> int:
    # current_iteration() reports the number of completed boosting rounds
    return booster.current_iteration()
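# Quick runnable check for get_num_trees (an illustration, not from the
# original source): after training for five rounds, current_iteration()
# reports five completed boosting rounds.
import numpy as np
import lightgbm as lgbm

rng = np.random.default_rng(7)
data = lgbm.Dataset(rng.normal(size=(50, 2)), label=rng.normal(size=50))
booster = lgbm.train({"objective": "regression", "verbose": -1}, data,
                     num_boost_round=5)
assert get_num_trees(booster) == 5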
def __init__(self, model_config_dict: dict, threads: int):
    """Initialise the tree model variables used in the application of
    RainForests Calibration.

    Args:
        model_config_dict:
            Dictionary containing Rainforests model configuration variables.
        threads:
            Number of threads to use during prediction with tree-model objects.

    Dictionary is of format::

        {
            "-50.0" : {
                "lightgbm_model" : "<path_to_lightgbm_model_object>",
                "treelite_model" : "<path_to_treelite_model_object>"
            },
            "-25.0" : {
                "lightgbm_model" : "<path_to_lightgbm_model_object>",
                "treelite_model" : "<path_to_treelite_model_object>"
            },
            ...,
            "50.0" : {
                "lightgbm_model" : "<path_to_lightgbm_model_object>",
                "treelite_model" : "<path_to_treelite_model_object>"
            }
        }

    The keys specify the error threshold value, while the associated values
    are the path to the corresponding tree-model objects for that threshold.

    Treelite predictors are used if treelite_runtime is an installed
    dependency and an associated path has been provided for all thresholds,
    otherwise lightgbm Boosters are used as the default tree model type.
    """
    from lightgbm import Booster

    try:
        from treelite_runtime import Predictor
    except ModuleNotFoundError:
        warnings.warn(
            "Module treelite_runtime unavailable. Defaulting to using lightgbm Boosters."
        )
        self.treelite_enabled = False
    else:
        self.treelite_enabled = True

    # Dictionary keys represent error thresholds, however they may be strings as
    # they are sourced from json files. In order to use these in processing, and
    # to sort them in a sensible fashion, we shall cast the key values as float32.
    sorted_model_config_dict = OrderedDict(
        sorted({np.float32(k): v for k, v in model_config_dict.items()}.items())
    )

    self.error_thresholds = np.array([*sorted_model_config_dict.keys()])

    lightgbm_model_filenames = [
        threshold_dict.get("lightgbm_model")
        for threshold_dict in sorted_model_config_dict.values()
    ]
    treelite_model_filenames = [
        threshold_dict.get("treelite_model")
        for threshold_dict in sorted_model_config_dict.values()
    ]
    if (None not in treelite_model_filenames) and self.treelite_enabled:
        self.tree_models = [
            Predictor(libpath=file, verbose=False, nthread=threads)
            for file in treelite_model_filenames
        ]
    else:
        if None in lightgbm_model_filenames:
            raise ValueError(
                "Path to lightgbm model missing for one or more error thresholds "
                "in model_config_dict."
            )
        self.tree_models = [
            Booster(model_file=file).reset_parameter({"num_threads": threads})
            for file in lightgbm_model_filenames
        ]