def get_results(X):
    # Encode categorical columns; `label`, `enc`, and `pca` are assumed to be
    # module-level transformers fitted at training time.
    categoricals = X.select_dtypes(include='object')
    categoricals = categoricals.astype(str)
    # Note: fit_transform refits the label encoder per column at prediction time.
    categoricals = categoricals.apply(label.fit_transform)
    # Keep 'country' as a plain label encoding; one-hot encode the rest.
    label_encoding = categoricals['country']
    categoricals.drop(['country'], axis=1, inplace=True)
    X_one = enc.transform(categoricals)
    encoded_data = pd.DataFrame(X_one.todense())
    encoded_data.reset_index(drop=True, inplace=True)
    categoricals.reset_index(drop=True, inplace=True)
    original_numeric = X.select_dtypes(include='number')
    original_numeric.reset_index(drop=True, inplace=True)
    X = pd.concat([original_numeric, encoded_data, label_encoding], axis=1).values
    Xp = pca.transform(X)
    # Rebuild a classifier around the booster saved on disk.
    clf = XGBClassifier()
    booster = Booster()
    booster.load_model('xgb.model')
    clf._Booster = booster
    classes = clf.predict_proba(Xp)
    y_pred = [0 if c[0] > 0.5 else 1 for c in classes]
    return y_pred
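# The function above leans on three module-level objects (`label`, `enc`, `pca`)
# that must already be fitted. A minimal sketch of how they could be prepared at
# training time, assuming standard scikit-learn transformers (the names and the
# X_train frame are illustrative, not from the original code):
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA

label = LabelEncoder()
enc = OneHotEncoder(handle_unknown='ignore')
pca = PCA(n_components=50)

train_cats = X_train.select_dtypes(include='object').astype(str)
train_cats = train_cats.apply(label.fit_transform)
enc.fit(train_cats.drop(['country'], axis=1))
# pca would then be fitted on the concatenated numeric + one-hot matrix.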
def predict(self, booster: xgb.Booster, **kwargs):
    """
    Run local XGBoost prediction.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    **kwargs : dict
        Other parameters for `xgboost.Booster.predict`.

    Returns
    -------
    tuple
        Pair of IP address of caller and pandas.DataFrame
        with partial prediction result.
    """
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = pandas.DataFrame(
        booster.predict(local_dpredict["dmatrix"], **kwargs),
        index=local_dpredict["index"],
    )
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return get_node_ip_address(), predictions
def load_saved_attributes():
    global model
    model = XGBRegressor()
    booster = Booster()
    booster.load_model('./ny_taxi_fare')
    model._Booster = booster
def analyze(self, event):
    array_list = [
        "lepJet_llpdnnx_-1_isLLP_QMU_QQMU",
        "lepJet_llpdnnx_0_isLLP_QMU_QQMU",
        "lepJet_llpdnnx_1_isLLP_QMU_QQMU",
        "lepJet_llpdnnx_2_isLLP_QMU_QQMU",
        "dimuon_mass", "dimuon_deltaR",
        "lepJet_pt", "lepJet_eta", "lepJet_deltaR",
        "MET_pt", "MET_phi",
        "looseMuons_pt", "looseMuons_eta", "looseMuons_dxy",
        "tightMuons_pt", "tightMuons_eta", "tightMuons_dxy",
    ]
    data = pd.DataFrame(data={
        "lepJet_llpdnnx_-1_isLLP_QMU_QQMU": getattr(event, "lepJet_llpdnnx_-1_isLLP_QMU_QQMU"),
        "lepJet_llpdnnx_0_isLLP_QMU_QQMU": event.lepJet_llpdnnx_0_isLLP_QMU_QQMU,
        "lepJet_llpdnnx_1_isLLP_QMU_QQMU": event.lepJet_llpdnnx_1_isLLP_QMU_QQMU,
        "lepJet_llpdnnx_2_isLLP_QMU_QQMU": event.lepJet_llpdnnx_2_isLLP_QMU_QQMU,
        "dimuon_mass": event.dimuon_mass,
        "dimuon_deltaR": event.dimuon_deltaR,
        "lepJet_pt": event.lepJet_pt,
        "lepJet_eta": event.lepJet_eta,
        "lepJet_deltaR": event.lepJet_deltaR,
        "MET_pt": event.MET_pt,
        "MET_phi": event.MET_phi,
        "looseMuons_pt": event.looseMuons_pt,
        "looseMuons_eta": event.looseMuons_eta,
        "looseMuons_dxy": event.looseMuons_dxy,
        "tightMuons_pt": event.tightMuons_pt,
        "tightMuons_eta": event.tightMuons_eta,
        "tightMuons_dxy": event.tightMuons_dxy,
    }, columns=array_list, index=[0])
    model = XGBClassifier()
    booster = Booster()
    #model._le = LabelEncoder().fit([1])
    booster.load_model(self.modelPath)
    booster.feature_names = array_list
    model._Booster = booster
    bdt_score = model.predict_proba(data)
    setattr(event, "bdt_score", bdt_score[:, 1])
    return True
def get_model(cls, algorithm_name: str, model_path: str):
    if algorithm_name == 'xgboost':
        model = xgb.XGBClassifier()
        booster = Booster()
        booster.load_model(model_path)
        model._Booster = booster
    else:
        model = joblib.load(model_path)
    return model
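# A hypothetical call to get_model, assuming it is exposed as a classmethod on a
# factory class (the ModelFactory name and file path below are illustrative):
model = ModelFactory.get_model('xgboost', 'models/classifier.xgb')
probabilities = model.predict_proba(X_test)  # sklearn API backed by the injected booster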
def test(x, modelFile):
    model = Booster()  # init model
    model.load_model(modelFile)  # load the trained model
    maps = np.load("Map.npy", allow_pickle=True)
    x_enc = encode([x])
    y_enc = model.predict(DMatrix(x_enc))
    y_pred = np.argmax(y_enc)
    # Map the predicted class index back to its original label.
    inverseMap = maps.item().get("inverseMap")
    y_hat = inverseMap[y_pred]
    print(y_hat)
def predict(self, booster: xgb.Booster, *args, **kwargs):
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = [
        booster.predict(X, *args, **kwargs) for X in local_dpredict
    ]
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return np.concatenate(predictions)
def train_xgb(X, y, params, save_path=None, save_path_booster=None):
    # The threshold is not handled by the XGB interface.
    params, binary_threshold = _parse_param_and_delete(params, 'binary_threshold', .5)

    # n_jobs is handled by the XGB sklearn interface.
    params = _parse_param_and_keep(params, name='n_jobs',
                                   default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()
    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False)
    model = clone(model)
    model.set_params(**params)
    logging.info('Training...')
    model.fit(
        X, y,
        # early_stopping_rounds=10,
        verbose=True,
    )

    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    model.get_booster().save_model(temp_file)
    booster = Booster(model_file=temp_file)
    os.remove(temp_file)

    if binary_threshold == 'auto':
        # Pick the cutoff so the predicted positive rate matches the class balance.
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]
        logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster, features=X.shape[1],
                                    threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model
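# Toy illustration of the binary_threshold='auto' rule in train_xgb: the scores
# are sorted ascending and the value at index (y == 0).sum() becomes the cutoff,
# so the predicted positive rate matches the training class balance. The numbers
# here are made up for the example:
import numpy as np

y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])            # 6 negatives, 4 positives
p_ = np.sort(np.array([.05, .1, .2, .3, .35, .4, .55, .6, .7, .9]))
threshold = p_[int((y == 0).sum())]                      # p_[6] == 0.55
# Scores >= 0.55 are then labeled positive: exactly 4 of 10, matching y.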
def predict(self, booster: xgb.Booster, **kwargs):
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = [
        pandas.DataFrame(booster.predict(X, **kwargs)) for X in local_dpredict
    ]
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return predictions if len(predictions) > 1 else predictions[0]
def create_predictor_infos():
    word_index = {}
    n_tokens = 0
    with open('../data/output/word_frequency.pkl', 'rb') as f:
        word_frequency = cPickle.load(f)
    assert type(word_frequency) == dict
    for k, v in sorted(word_frequency.items(), key=lambda x: x[1]):
        if v > THRESHOLD_FREQ:
            word_index[k] = n_tokens
            n_tokens += 1
    bst = Booster()
    bst.load_model('../data/model/xgboost_model.model')
    return word_index, n_tokens, bst
def predict(self, booster: xgb.Booster, **kwargs):
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = pandas.DataFrame(
        booster.predict(local_dpredict["dmatrix"], **kwargs),
        index=local_dpredict["index"],
    )
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return get_node_ip_address(), predictions
def test_it_can_override_an_existing_one(self) -> None:
    station_id = uuid.uuid4()
    first = StationAvailabilityAlgorithm(
        station_id, DataFrame(['data_1', 'frame_1', 'test_1']), Booster())
    self._repository.save(first)
    override = StationAvailabilityAlgorithm(
        station_id, DataFrame(['data_2', 'frame_2', 'test_2']), Booster())
    self._repository.save(override)
    self.assertEqual(self._repository.find_by_station_id(station_id), override)
def xgb_latest() -> Tuple[Booster, Dict[str, pandas.Categorical]]:
    # Note: a tuple return type must be written Tuple[...] (requires
    # `from typing import Tuple`); a bare (Booster, Dict) annotation is not valid typing.
    base_path = '/var/opt/pcsml/devel/training_data/dumps/debug004/2017-12-27T18-30-59'
    model = Booster()
    model.load_model(os.path.join(base_path, 'model_2017-12-27T18-30-59.xgb'))
    with gzip.open(
            os.path.join(base_path,
                         'model_2017-12-27T18-30-59_column_categories.pickle.gz'),
            'rb') as f:
        column_categories = pickle.load(f)
    return model, column_categories
def select_stocks(context, data):
    #clf = pickle.load(BytesIO(read_file('xgb_factors_model_ZZ800_D.model')))
    #file = read_file('xgb_factors_model_ZZ800_D.model')
    #clf = Booster.load_model(fname = BytesIO(read_file('xgb_factors_model_ZZ800_D.model')))
    with open('temp', 'wb') as f:
        f.write(read_file('xgb_factors.model'))  # write a temporary file; clean it up when the process ends
    clf = Booster(model_file='temp')
    #clf = Booster.load_model(fname = 'temp')
    industry_old_code = ['801010', '801020', '801030', '801040', '801050', '801080',
                         '801110', '801120', '801130', '801140', '801150', '801160',
                         '801170', '801180', '801200', '801210', '801230']
    industry_new_code = ['801010', '801020', '801030', '801040', '801050', '801080',
                         '801110', '801120', '801130', '801140', '801150', '801160',
                         '801170', '801180', '801200', '801210', '801230', '801710',
                         '801720', '801730', '801740', '801750', '801760', '801770',
                         '801780', '801790', '801880', '801890']
    starttime = datetime.datetime.now()
    date = context.previous_date  # fetch industry factor data as of this date
    print('Data retrieval date:', date)
    '''
    if datetime.datetime.strptime(date,"%Y-%m-%d").date()<datetime.date(2014,2,21):
        industry_code=industry_old_code
    else:
    '''
    industry_code = industry_new_code
    stockList = get_stock('ZZ800', date)
    factor_origl_data = get_factor_data(stockList, date)
    factor_solve_data = data_preprocessing(factor_origl_data, stockList,
                                           industry_code, date)
    endtime = datetime.datetime.now()
    print('Data fetch runtime:', int((endtime - starttime).seconds / 60), 'minutes')

    test_feature_or = factor_solve_data.copy()
    test_feature = np.array(test_feature_or)

    # Model prediction
    test_predict = clf.predict(DMatrix(test_feature_or))
    test_sample_predict = pd.DataFrame(
        data=test_predict,
        index=test_feature_or.index,
        columns=['XGB_predict_0', 'XGB_predict_1', 'XGB_predict_2',
                 'XGB_predict_3', 'XGB_predict_4', 'XGB_predict_5',
                 'XGB_predict_6', 'XGB_predict_7', 'XGB_predict_8',
                 'XGB_predict_9', 'XGB_predict_10', 'XGB_predict_11'])
    #test_sample_predict['XGB_predict_0_and_1'] = test_sample_predict['XGB_predict_0'] + test_sample_predict['XGB_predict_1']
    test_sample_predict = test_sample_predict.sort_values(by='XGB_predict_0',
                                                          ascending=False)
    stock_list = test_sample_predict.index.values.tolist()
    stock_list = stock_list[:g.buy_stock_count]
    return stock_list
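# A tidier variant of the temp-file workaround above. Booster needs a real file
# path rather than a BytesIO, but the standard tempfile module avoids leaving a
# stray 'temp' file behind (read_file as in the original; this is a sketch, not
# the original author's code):
import os
import tempfile

with tempfile.NamedTemporaryFile(suffix='.model', delete=False) as tmp:
    tmp.write(read_file('xgb_factors.model'))
    tmp_path = tmp.name
try:
    clf = Booster(model_file=tmp_path)
finally:
    os.remove(tmp_path)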
def upload_xgb_to_memsql(xgb: Booster,
                         conn: Connection,
                         udf_name: str,
                         func=F.SIGMOID,
                         feature_names: List[str] = None,
                         allow_overwrite: bool = False) -> None:
    if feature_names:
        xgb.feature_names = feature_names
    trees = split_trees(xgb.trees_to_dataframe())
    sqls = [tree_to_func_def(udf_name, allow_overwrite, t) for t in trees]
    sqls.append(
        tree_to_main_func(udf_name, allow_overwrite, trees, xgb.feature_names, func))
    for s in sqls:
        assert 1 == conn.query(s)
def test_model(self, model: xgb.XGBClassifier, df: pd.DataFrame) -> Tuple[str, float]:
    num_cols = [
        "shipping_free", "price", "accepts_mercadopago", "automatic_relist",
        "initial_quantity", "sold_quantity", "available_quantity", "quantity",
    ]
    # Note: ("target",) needs the trailing comma; a bare ("target") is just a
    # string, so `c not in ("target")` would be a substring test.
    cat_cols = [
        c for c in df.columns if c not in num_cols and c not in ("target",)
    ]
    df = self.feature_engineer(df, cat_cols)
    features = [f for f in df.columns if f not in ("target",)]
    for col in features:
        if col not in num_cols:
            lbl = LabelEncoder()
            lbl.fit(df[col])
            df.loc[:, col] = lbl.transform(df[col])
    X = df.drop(["target"], axis=1).values
    y = df.target.values
    # predict_proba is the sklearn-wrapper API, so the model must be an
    # XGBClassifier rather than a raw Booster.
    preds = model.predict_proba(X)[:, 1]
    auc = metrics.roc_auc_score(y, preds)
    logger.info(f"AUC Test = {auc}")
    return "auc", auc
def load_or_create(objective='multi:softprob', max_depth=2, seed=4242,
                   eval_metric='merror', num_class=4520, num_feature=256,
                   **kwargs):
    # `from_scratch` is assumed to be a module-level flag.
    if from_scratch:
        print_info('Creating XGB Boosted Tree')
        params = {
            'updater': 'grow_gpu',
            'predictor': 'gpu_predictor',
            'tree_method': 'gpu_hist',
            'eval_metric': eval_metric,
            'objective': objective,
            'num_class': num_class,
            'max_depth': max_depth,
            'seed': seed,
            'num_feature': num_feature,
        }
        params = {**params, **kwargs}
        model = Booster(params)
    else:
        model = load_model()
    return model
def _run_xgboost(model: xgb.Booster, data: pd.DataFrame) -> pd.DataFrame:
    """Retrieve the win probability.

    Parameters
    ----------
    model : xgb.Booster
        The fitted XGBoost model.
    data : pd.DataFrame
        The input dataset to be evaluated.

    Returns
    -------
    pd.DataFrame
        The updated dataset.
    """
    # First, get the partial hazard values
    hazard = model.predict(xgb.DMatrix(data[META["static"] + META["dynamic"]]))
    # Get the cumulative probability; `cumulative_hazard_` is assumed to be
    # attached to the booster by the training code.
    c0 = interpolate_at_times(model.cumulative_hazard_, data["stop"].values)
    new = data.copy()
    new[META["survival"]] = 1 - np.exp(-(c0 * hazard))
    return new
def xgbooster_predict_proba(booster: xgb.Booster, d_x: xgb.DMatrix) -> np.ndarray:
    """ Simulate the `predict_proba` interface from sklearn

    This function will only work as expected if `booster` has been trained
    using the `binary:logistic` loss.

    Parameters
    ----------
    booster : xgboost.Booster
        The trained booster

    d_x : xgboost.DMatrix
        The dataset

    Returns
    -------
    y_proba_pred : numpy.ndarray
        The probabilistic predictions. The shape of the array is (n_row, 2).
    """
    y_score = booster.predict(d_x)
    y_false = 1 - y_score
    size = (d_x.num_row(), 2)
    y_proba_pred = np.zeros(size)
    y_proba_pred[:, 0] = y_false
    y_proba_pred[:, 1] = y_score
    return y_proba_pred
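# A minimal usage sketch for xgbooster_predict_proba; the model file name and
# the random data are illustrative:
import numpy as np
import xgboost as xgb

booster = xgb.Booster()
booster.load_model('binary_logistic.model')  # must have been trained with binary:logistic
d_test = xgb.DMatrix(np.random.rand(5, 10))
proba = xgbooster_predict_proba(booster, d_test)
assert proba.shape == (5, 2)
assert np.allclose(proba.sum(axis=1), 1.0)  # columns are P(y=0) and P(y=1)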
def load_xgb_model(fname):
    """
    Load an XGBoost model that was saved as a file with the
    HyperXGBClassifier.save method. The model spans two files:

    * The first file contains the model saved with the Booster class;
      this file has no extension.
    * The second file contains the parameters used to create the model;
      this file has the extension '.p'.

    Parameters
    ----------
    fname : path
        The file name without extension.
    """
    from xgboost import Booster
    params = pickle.load(open(fname + '.p', "rb"))
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperXGBClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    y = [i for i in range(n_classes)]
    model.set_le(y)
    model._Booster = Booster(model_file=fname)
    return model
def get_feature_importances_from_booster(cls, booster: Booster) -> np.ndarray:
    """Gets feature importances from an XGB booster.

    This is based on the feature_importance_ property defined in:
    https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py

    Args:
        booster(Booster): Booster object, most of the time the median model
            (quantile=0.5) is preferred

    Returns:
        (np.ndarray) with normalized feature importances
    """
    # Get score
    score = booster.get_score(importance_type="gain")
    # Get feature names from booster
    feature_names = booster.feature_names
    # Get importance
    feature_importance = [score.get(f, 0.0) for f in feature_names]
    # Convert to array
    features_importance_array = np.array(feature_importance, dtype=np.float32)
    total = features_importance_array.sum()  # For normalizing
    if total == 0:
        return features_importance_array
    return features_importance_array / total  # Normalize
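# Illustrative use: rank features by normalized gain. The FeatureImportanceMixin
# class name and model path are hypothetical stand-ins for wherever this
# classmethod actually lives:
booster = Booster(model_file='median_model.xgb')
importances = FeatureImportanceMixin.get_feature_importances_from_booster(booster)
top10 = sorted(zip(booster.feature_names, importances),
               key=lambda pair: pair[1], reverse=True)[:10]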
def test_it_can_not_find_one_by_station_id_if_it_does_not_exist(self) -> None:
    station_availability_algorithm = StationAvailabilityAlgorithm(
        uuid.uuid4(), DataFrame(['data', 'frame', 'test']), Booster())
    self._repository.save(station_availability_algorithm)
    self.assertIsNone(self._repository.find_by_station_id(uuid.uuid4()))
def after_iteration(
    self, model: xgb.Booster, epoch: int,
    evals_log: xgb.callback.TrainingCallback.EvalsLog
):
    # dmat, y_lower, y_upper, X and acc_rec come from the enclosing scope.
    y_pred = model.predict(dmat)
    # Fraction of predictions falling inside the [y_lower, y_upper] interval.
    acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)) / len(X)
    acc_rec.append(acc)
    return False  # returning False lets training continue
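# Sketch of how a callback like the one above is attached to training. The
# after_iteration method would sit on a subclass of
# xgb.callback.TrainingCallback; the IntervalAccuracyCallback name, params and
# dtrain below are illustrative:
acc_rec = []
bst = xgb.train(params, dtrain, num_boost_round=100,
                callbacks=[IntervalAccuracyCallback()])
# acc_rec now holds one interval-coverage value per boosting round.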
def predict(self, model: xgb.Booster, data: RayDMatrix, **kwargs):
    _set_omp_num_threads()
    if data not in self._data:
        self.load_data(data)
    local_data = self._data[data]
    predictions = model.predict(local_data, **kwargs)
    return predictions
def merge_labeled_weight_importance(model: Booster,
                                    label_encoder: OneHotLabelEncoder) -> Dict:
    f_imp = model.get_score(importance_type='weight')
    merged: Dict[str, int] = {}
    for f in f_imp:
        src_feature = label_encoder.source_column(f)
        merged[src_feature] = merged.get(src_feature, 0) + f_imp[f]
    return merged
def test_it_can_find_one_by_station_id(self) -> None:
    station_id = uuid.uuid4()
    station_availability_algorithm = StationAvailabilityAlgorithm(
        station_id, DataFrame(['data', 'frame', 'test']), Booster())
    self._repository.save(station_availability_algorithm)
    self.assertEqual(self._repository.find_by_station_id(station_id),
                     station_availability_algorithm)
def merge_labeled_weight_importance(
        model: Booster,
        dummy_col_sep=categorical_util.DUMMY_COL_SEP) -> Dict[str, int]:
    f_imp = model.get_score(importance_type='weight')
    merged: Dict[str, int] = {}
    for f in f_imp:
        src_feature = categorical_util.get_source_name_from_dummy(f, dummy_col_sep)
        merged[src_feature] = merged.get(src_feature, 0) + f_imp[f]
    return merged
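# Toy illustration of the merge, assuming DUMMY_COL_SEP is '__' and
# get_source_name_from_dummy strips everything after the separator: two dummy
# columns derived from 'color' fold back into a single source-feature count.
f_imp = {'color__red': 3, 'color__blue': 2, 'age': 7}
# merge_labeled_weight_importance would then produce:
# {'color': 5, 'age': 7}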
def load_saved_attributes():
    global host_response_time_values
    global neighbourhood_values
    global property_type_values
    global room_type_values
    global cancellation_policy_values
    global model
    with open("columns.json", "r") as f:
        resp = json.load(f)
    host_response_time_values = resp["host_response_time"]
    neighbourhood_values = resp["neighbourhood"]
    property_type_values = resp["property_type"]
    room_type_values = resp["room_type"]
    cancellation_policy_values = resp["cancellation_policy"]
    model = XGBRegressor()
    booster = Booster()
    booster.load_model('airbnb_price_predictor')
    model._Booster = booster
def load(filename):
    '''
    Loads in an xgboost model from the given file location

    Parameters
    ----------
    filename : string
        path of model file to be loaded

    Returns
    -------
    booster : xgboost.Booster()
        model that is loaded
    metadata : dict
        parameter metadata for model in the form of json data.
        Use get_params() function to use in model prediction.
    '''
    booster = Booster({'nthread': 4})
    # Check if model file exists as it has been written by the user.
    # If not, add model_ to filename as designated in save()
    if not path.exists(filename):
        model_file = filename.split('/')
        model_file[-1] = 'model_' + model_file[-1]
        model_file = '/'.join(model_file)
    else:
        model_file = filename
    config_file = model_file.replace('model_', 'config_')
    booster.load_model(model_file)
    with open(config_file, 'r', encoding='utf-8') as f:
        config = f.read()
    # Parse the JSON once; calling json.loads again on the resulting dict
    # would raise a TypeError.
    metadata = json.loads(config)
    return booster, metadata
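# Hypothetical round trip with the save()/load() pair described in the
# docstring above (paths and data are illustrative):
booster, metadata = load('models/my_model')  # resolves model_my_model + config_my_model
preds = booster.predict(DMatrix(X_new))
print(metadata)  # the parameter metadata saved alongside the model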
def from_model(
    cls,
    booster: xgboost.Booster,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "XGBoostCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores an XGBoost model.

    Args:
        booster: The XGBoost model to store in the checkpoint.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        An :py:class:`XGBoostCheckpoint` containing the specified ``Estimator``.

    Examples:
        >>> from ray.train.xgboost import XGBoostCheckpoint
        >>> import xgboost
        >>>
        >>> booster = xgboost.Booster()
        >>> checkpoint = XGBoostCheckpoint.from_model(booster, path=".")  # doctest: +SKIP # noqa: E501

        You can use a :py:class:`XGBoostCheckpoint` to create an
        :py:class:`~ray.train.xgboost.XGBoostPredictor` and perform inference.

        >>> from ray.train.xgboost import XGBoostPredictor
        >>>
        >>> predictor = XGBoostPredictor.from_checkpoint(checkpoint)  # doctest: +SKIP # noqa: E501
    """
    booster.save_model(os.path.join(path, MODEL_KEY))
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)
    checkpoint = cls.from_directory(path)
    return checkpoint
def to_air_checkpoint(
    path: str,
    booster: xgboost.Booster,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serve or inference.

    Args:
        path: The directory path where model and preprocessor steps are stored to.
        booster: A pretrained xgboost model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.

    Returns:
        A Ray AIR checkpoint.
    """
    booster.save_model(os.path.join(path, MODEL_KEY))
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)
    checkpoint = Checkpoint.from_directory(path)
    return checkpoint
def my_train_xgboost(params, dtrain, num_boost_round=10, evals=(), obj=None,
                     feval=None, early_stopping_rounds=None, seed=0,
                     rt_eta=1.0006, rt_ssp=1.0006, rt_clb=1.0006, rt_dpt=1.0001):
    """
    Train a booster with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    evals : list of pairs (DMatrix, string)
        List of items to be evaluated during training, this allows user to
        watch performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    early_stopping_rounds : int
        Activates early stopping. Validation error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue training.
        Requires at least one item in evals. If there's more than one, will
        use the last. Returns the model from the last iteration (not the best
        one). If early stopping occurs, the model will have two additional
        fields: bst.best_score and bst.best_iteration.

    Returns
    -------
    booster : a trained booster model
    """
    eta = params['eta']
    ssp = params['subsample']
    clb = params['colsample_bytree']
    # rt_eta = np.random.random()
    rt_ssp = np.random.uniform(0.1, 0.9)
    rt_clb = np.random.uniform(0.1, 0.9)
    evals = list(evals)
    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    # Booster() takes no `seed` keyword, so set it as a training parameter.
    bst.set_param({'seed': seed})

    if not early_stopping_rounds:
        for i in range(num_boost_round):
            bst.set_param({'eta': eta})
            bst.set_param({'subsample': ssp})
            bst.set_param({'colsample_bytree': clb})
            eta = eta * rt_eta
            # ssp = ssp * rt_ssp
            # clb = clb * rt_clb
            ssp = rt_ssp
            clb = rt_clb
            bst.update(dtrain, i, obj)
            if len(evals) != 0:
                bst_eval_set = bst.eval_set(evals, i, feval)
                if isinstance(bst_eval_set, string_types):
                    sys.stderr.write(bst_eval_set + '\n')
                else:
                    sys.stderr.write(bst_eval_set.decode() + '\n')
        return bst
    else:
        # early stopping
        if len(evals) < 1:
            raise ValueError('For early stopping you need at least one set in evals.')
        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n"
                         .format(evals[-1][1], early_stopping_rounds))

        # is params a list of tuples? are we using multiple eval metrics?
        if type(params) == list:
            if len(params) != len(dict(params).items()):
                raise ValueError('Check your params. '
                                 'Early stopping works with single eval metric only.')
            params = dict(params)

        # either minimize loss or maximize AUC/MAP/NDCG
        maximize_score = False
        if 'eval_metric' in params:
            maximize_metrics = ('auc', 'map', 'ndcg')
            # Note: a bare filter(...) object is always truthy in Python 3,
            # so use any() for the actual membership test.
            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
                maximize_score = True
        if maximize_score:
            best_score = 0.0
        else:
            best_score = float('inf')

        best_msg = ''
        best_score_i = 0

        for i in range(num_boost_round):
            bst.set_param({'eta': eta})
            bst.set_param({'subsample': ssp})
            bst.set_param({'colsample_bytree': clb})
            eta = eta * rt_eta
            # ssp = ssp * rt_ssp
            # clb = clb * rt_clb
            ssp = rt_ssp
            clb = rt_clb
            bst.update(dtrain, i, obj)
            bst_eval_set = bst.eval_set(evals, i, feval)

            if isinstance(bst_eval_set, string_types):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()

            sys.stderr.write(msg + '\n')

            score = float(msg.rsplit(':', 1)[1])
            if (maximize_score and score > best_score) or \
                    (not maximize_score and score < best_score):
                best_score = score
                best_score_i = i
                best_msg = msg
            elif i - best_score_i >= early_stopping_rounds:
                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
                bst.best_score = best_score
                bst.best_iteration = best_score_i
                return bst
        return bst
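# A sketch of calling my_train_xgboost with early stopping; the data and
# parameter values are illustrative:
dtrain = DMatrix(X_tr, label=y_tr)
dval = DMatrix(X_va, label=y_va)
params = {'eta': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8,
          'objective': 'binary:logistic', 'eval_metric': 'auc'}
bst = my_train_xgboost(params, dtrain, num_boost_round=200,
                       evals=[(dval, 'validation')], early_stopping_rounds=10)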