def xgbooster_predict_proba(booster: xgb.Booster, d_x: xgb.DMatrix) -> np.ndarray:
    """
    Mimic scikit-learn's `predict_proba` on top of a raw XGBoost booster.

    The result is only meaningful when `booster` was trained with the
    `binary:logistic` loss, because the value returned by
    `booster.predict` is then used directly as the probability of the
    positive class.

    Parameters
    ----------
    booster : xgboost.Booster
        The trained booster.
    d_x : xgboost.DMatrix
        The dataset to score.

    Returns
    -------
    y_probas_pred : numpy.ndarray
        Array of shape (n_row, 2). Column 1 holds the booster output and
        column 0 holds its complement (1 - output).
    """
    positive = booster.predict(d_x)

    # Both columns are overwritten below, so the buffer needs no initial value.
    y_probas_pred = np.empty((d_x.num_row(), 2))
    y_probas_pred[:, 0] = 1 - positive
    y_probas_pred[:, 1] = positive
    return y_probas_pred
def _run_xgboost(model: xgb.Booster, data: pd.DataFrame) -> pd.DataFrame:
    """Append the predicted cumulative probability to a copy of ``data``.

    Parameters
    ----------
    model : xgb.Booster
        The fitted XGBoost model. It must also carry a
        ``cumulative_hazard_`` attribute, which is read here.
    data : pd.DataFrame
        The input dataset to be evaluated. It must contain the columns
        listed in ``META["static"]`` and ``META["dynamic"]`` as well as a
        ``"stop"`` column.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` with the column named by ``META["survival"]``
        set to ``1 - exp(-(c0 * hazard))``.
    """
    feature_columns = META["static"] + META["dynamic"]

    # Partial hazard predicted by the booster for every row.
    hazard = model.predict(xgb.DMatrix(data[feature_columns]))

    # Cumulative baseline hazard evaluated at each row's stop time.
    c0 = interpolate_at_times(model.cumulative_hazard_, data["stop"].values)

    probability = 1 - np.exp(-(c0 * hazard))

    new = data.copy()
    new[META["survival"]] = probability
    return new
def predict(self, booster: xgb.Booster, **kwargs):
    """
    Run XGBoost prediction on the data held locally by this object.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster. Its ``nthread`` parameter is overwritten with
        ``self._nthreads`` before predicting.
    **kwargs : dict
        Other parameters forwarded to `xgboost.Booster.predict`.

    Returns
    -------
    tuple
        Pair of the value returned by ``get_node_ip_address()`` and a
        pandas.DataFrame holding the partial prediction result, indexed
        by the locally stored index.
    """
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})

    s = time.time()
    raw_output = booster.predict(local_dpredict["dmatrix"], **kwargs)
    predictions = pandas.DataFrame(raw_output, index=local_dpredict["index"])
    # The logged duration covers both the prediction and the DataFrame build.
    LOGGER.info(f"Local prediction time: {time.time() - s} s")

    return get_node_ip_address(), predictions
def after_iteration(
    self,
    model: xgb.Booster,
    epoch: int,
    evals_log: xgb.callback.TrainingCallback.EvalsLog,
):
    """Record how many predictions fall inside the target interval.

    After each boosting round the model predicts on ``dmat`` and the share
    of predictions lying within ``[y_lower, y_upper]`` (relative to
    ``len(X)``) is appended to ``acc_rec``. ``dmat``, ``y_lower``,
    ``y_upper``, ``X`` and ``acc_rec`` are taken from the enclosing scope.

    Parameters
    ----------
    model : xgb.Booster
        The booster as of the current round.
    epoch : int
        Index of the current round (unused).
    evals_log : xgb.callback.TrainingCallback.EvalsLog
        Evaluation history (unused).

    Returns
    -------
    bool
        Always ``False``.
        NOTE(review): under xgboost's callback protocol a falsy return is
        understood to let training continue - confirm against the docs.
    """
    y_pred = model.predict(dmat)
    inside_interval = np.logical_and(y_pred >= y_lower, y_pred <= y_upper)

    # Each hit contributes 1 / len(X); summing the contributions gives the ratio.
    acc_rec.append(np.sum(inside_interval / len(X)))
    return False
def predict(self, model: xgb.Booster, data: RayDMatrix, **kwargs):
    """Predict with ``model`` on this object's local copy of ``data``.

    ``data`` is loaded through ``self.load_data`` the first time it is
    seen; afterwards the entry cached in ``self._data`` is reused.

    Parameters
    ----------
    model : xgb.Booster
        The booster used for prediction.
    data : RayDMatrix
        Key identifying the dataset; also the argument given to
        ``self.load_data`` when the dataset has not been loaded yet.
    **kwargs : dict
        Extra arguments forwarded to ``model.predict``.

    Returns
    -------
    The value returned by ``model.predict`` for the local data.
    """
    _set_omp_num_threads()

    already_loaded = data in self._data
    if not already_loaded:
        self.load_data(data)

    return model.predict(self._data[data], **kwargs)
def test(x, modelFile):
    """Predict the label of a single sample and print it.

    Parameters
    ----------
    x
        One raw sample; it is wrapped in a list and passed to ``encode``.
    modelFile
        Path of the saved XGBoost model to load.

    The decoded label is looked up in the ``"inverseMap"`` entry of the
    object stored in ``Map.npy`` and written to stdout. Nothing is returned.
    """
    # Restore the trained booster from disk.
    model = Booster()
    model.load_model(modelFile)

    # NOTE: allow_pickle=True unpickles the file content; only load a
    # Map.npy that comes from a trusted source.
    maps = np.load("Map.npy", allow_pickle=True)

    scores = model.predict(DMatrix(encode([x])))
    best_index = np.argmax(scores)

    # Map the winning index back to the original label.
    inverse_map = maps.item().get("inverseMap")
    print(inverse_map[best_index])
def predict(self, booster: xgb.Booster, *args, **kwargs):
    """Predict on every local data partition and concatenate the results.

    Parameters
    ----------
    booster : xgb.Booster
        A trained booster. Its ``nthread`` parameter is overwritten with
        ``self._nthreads`` before predicting.
    *args, **kwargs
        Extra arguments forwarded to ``booster.predict`` for each partition.

    Returns
    -------
    numpy.ndarray
        The per-partition predictions joined with ``np.concatenate``, in
        the iteration order of ``self._dpredict``.
    """
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})

    s = time.time()
    predictions = []
    for X in local_dpredict:
        predictions.append(booster.predict(X, *args, **kwargs))
    LOGGER.info(f"Local prediction time: {time.time() - s} s")

    return np.concatenate(predictions)
def predict(self, booster: xgb.Booster, **kwargs):
    """Predict on every local data partition, one DataFrame per partition.

    Parameters
    ----------
    booster : xgb.Booster
        A trained booster. Its ``nthread`` parameter is overwritten with
        ``self._nthreads`` before predicting.
    **kwargs : dict
        Extra arguments forwarded to ``booster.predict``.

    Returns
    -------
    list of pandas.DataFrame or pandas.DataFrame
        The list of per-partition frames when there is more than one
        partition, otherwise the first frame on its own.
    """
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})

    s = time.time()
    predictions = [
        pandas.DataFrame(booster.predict(X, **kwargs))
        for X in local_dpredict
    ]
    LOGGER.info(f"Local prediction time: {time.time() - s} s")

    if len(predictions) > 1:
        return predictions
    # Unwrap so that callers receive a bare DataFrame in this case.
    return predictions[0]
def train_xgb(X, y, params, save_path=None, save_path_booster=None):
    """Train a binary XGBoost classifier and wrap its booster.

    The model is fitted through the scikit-learn interface, then its
    booster is saved to a temporary file and loaded back as a plain
    ``Booster`` (a feature-agnostic model) before being wrapped in
    ``XGBClassifierSKLWrapper``.

    Parameters
    ----------
    X : array-like
        Training features; converted with ``np.asarray``.
    y : array-like
        Training labels; flattened. The set of distinct values must be
        exactly {0, 1}.
    params : dict
        Parameters for ``XGBClassifier``. ``'binary_threshold'`` is
        extracted first (default 0.5) because the XGBoost interface does
        not handle it, and ``'n_jobs'`` defaults to
        ``min(max_cpu_count(), 24)``.
        NOTE(review): presumably ``_parse_param_and_delete`` removes the
        key from the returned params - confirm against the helper.
    save_path : optional
        If not ``None``, the wrapped model is pickled to this location.
    save_path_booster : optional
        If not ``None``, the wrapper's booster is pickled to this location.

    Returns
    -------
    XGBClassifierSKLWrapper
        The wrapped booster with the resolved decision threshold.

    Raises
    ------
    NotImplementedError
        If ``y`` does not contain exactly the two classes 0 and 1.
    """
    # the threshold is not handled by XGB interface
    params, binary_threshold = _parse_param_and_delete(params, 'binary_threshold', .5)
    # n_jobs is handled by XGB SKL interface
    params = _parse_param_and_keep(params, name='n_jobs', default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()

    # np.unique already returns its values sorted.
    if tuple(np.unique(y)) != (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only support biinary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False)
    model = clone(model)
    model.set_params(**params)

    logging.info('Training...')
    model.fit(X, y, verbose=True)

    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    try:
        model.get_booster().save_model(temp_file)
        booster = Booster(model_file=temp_file)
    finally:
        # Always clean up, even when saving or loading fails; the file may
        # not exist yet if save_model itself raised.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    if binary_threshold == 'auto':
        # Use the sorted training score at position "number of negatives"
        # as the threshold, so that many scores lie below it.
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]
        logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster, features=X.shape[1], threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)

    return model
def predict(self, booster: xgb.Booster, **kwargs):
    """Predict on the locally held DMatrix and tag the result with this node.

    Parameters
    ----------
    booster : xgb.Booster
        A trained booster. Its ``nthread`` parameter is overwritten with
        ``self._nthreads`` before predicting.
    **kwargs : dict
        Extra arguments forwarded to ``booster.predict``.

    Returns
    -------
    tuple
        ``(get_node_ip_address(), predictions)`` where ``predictions`` is a
        pandas.DataFrame indexed by ``self._dpredict["index"]``.
    """
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})

    s = time.time()
    values = booster.predict(local_dpredict["dmatrix"], **kwargs)
    row_index = local_dpredict["index"]
    predictions = pandas.DataFrame(values, index=row_index)
    LOGGER.info(f"Local prediction time: {time.time() - s} s")

    node_ip = get_node_ip_address()
    return node_ip, predictions
def select_stocks(context, data):
    """Rank the 'ZZ800' universe with the stored XGBoost model and pick stocks.

    Factor data for ``context.previous_date`` is fetched and preprocessed,
    scored by the booster, and the stocks are ordered by the
    ``'XGB_predict_0'`` score, highest first.

    Parameters
    ----------
    context
        Strategy context; only ``context.previous_date`` is read.
    data
        Unused; kept for the caller's signature.

    Returns
    -------
    list
        Index labels of the top ``g.buy_stock_count`` rows.
    """
    # The model is loaded from a file path, so the stored model bytes are
    # first written to a temporary local file (per the original note, the
    # file is cleaned up after the process ends).
    with open('temp', 'wb') as f:
        f.write(read_file('xgb_factors.model'))
    clf = Booster(model_file='temp')

    # NOTE: a previously disabled branch used only the first 17 codes of
    # this list for dates before 2014-02-21; the full list is always used now.
    industry_code = [
        '801010', '801020', '801030', '801040', '801050', '801080',
        '801110', '801120', '801130', '801140', '801150', '801160',
        '801170', '801180', '801200', '801210', '801230', '801710',
        '801720', '801730', '801740', '801750', '801760', '801770',
        '801780', '801790', '801880', '801890',
    ]

    starttime = datetime.datetime.now()
    date = context.previous_date
    # Fetch the industry factor data.
    print('获取数据的日期:', date)

    stockList = get_stock('ZZ800', date)
    factor_origl_data = get_factor_data(stockList, date)
    factor_solve_data = data_preprocessing(factor_origl_data, stockList, industry_code, date)
    endtime = datetime.datetime.now()
    print('取数运行时长:', int((endtime - starttime).seconds / 60), '分钟')

    test_feature_or = factor_solve_data.copy()

    # Model prediction: one score column per model output, 12 in total.
    test_predict = clf.predict(DMatrix(test_feature_or))
    predict_columns = ['XGB_predict_{}'.format(i) for i in range(12)]
    test_sample_predict = pd.DataFrame(
        data=test_predict,
        index=test_feature_or.index,
        columns=predict_columns,
    )

    test_sample_predict = test_sample_predict.sort_values(by='XGB_predict_0', ascending=False)
    stock_list = test_sample_predict.index.values.tolist()
    return stock_list[:g.buy_stock_count]
def predict(booster: xgb.Booster, X):
    '''Customized prediction that turns raw margins into a class index.

    The booster is asked for its untransformed output (the raw prediction
    from the tree leaf weights) and, for every row, the index of the
    largest value is taken as the predicted class. The values have not
    gone through softmax, so they are not probabilities and do not sum to
    1, but the position of the maximum is the same.

    Returns an array of length ``kRows`` (a name taken from the enclosing
    module); entry ``r`` holds the class index chosen for row ``r``.
    '''
    predt = booster.predict(X, output_margin=True)

    out = np.zeros(kRows)
    for r, row_margins in enumerate(predt):
        out[r] = np.argmax(row_margins)
    return out
def _generate_cumulative_hazard(
    model: xgb.Booster,
    train_data: pd.DataFrame,
    dtrain: xgb.DMatrix,
) -> pd.DataFrame:
    """Generate the cumulative baseline hazard from the training data.

    For every distinct stop time at which an event occurred, the baseline
    hazard is the number of events at that time divided by the summed
    predicted hazard of all rows whose interval ``(start, stop]`` contains
    that time. The per-time values are then accumulated.

    Parameters
    ----------
    model : xgb.Booster
        The trained model.
    train_data : pd.DataFrame
        The training dataset. It must contain ``"start"`` and ``"stop"``
        columns and the event column named by ``META["event"]``.
    dtrain : xgb.DMatrix
        The training dataset in DMatrix form; its rows are expected to be
        aligned with ``train_data``.

    Returns
    -------
    pd.DataFrame
        A single-column frame (``"baseline hazard"``) indexed by the unique
        event times, holding the cumulative sum of the baseline hazard.
    """
    # First, get the partial hazard values
    hazard = model.predict(dtrain)

    # Pull the columns out once; they do not change inside the loop.
    starts = train_data["start"].values
    stops = train_data["stop"].values
    events = train_data[META["event"]].values

    # Get the unique failure times
    unique_death_times = np.unique(stops[events == 1])

    # Use an explicit float column: inheriting the dtype of the time values
    # would yield an integer column for integer times, and the float ratios
    # assigned below would then need an implicit upcast.
    baseline_hazard_ = pd.DataFrame(
        np.zeros(len(unique_death_times), dtype=float),
        index=unique_death_times,
        columns=["baseline hazard"],
    )

    for t in unique_death_times:
        # Rows whose (start, stop] interval covers time t.
        ix = (starts < t) & (t <= stops)
        deaths = events[ix] & (stops[ix] == t)
        baseline_hazard_.loc[t] = deaths.sum() / hazard[ix].sum()

    return baseline_hazard_.cumsum()
def test_model(model: Booster, Xt, yt, test_indices):
    """Compare the model's top pick per group against a random pick.

    The rows of ``Xt`` / ``yt`` are split into consecutive groups; each
    value in ``test_indices`` is the (exclusive) end offset of one group
    and the next group starts where the previous one ended.

    Parameters
    ----------
    model : Booster
        The trained booster; raw margins are used for ranking.
    Xt
        Test features.
    yt
        Test labels; each group is expected to contain exactly one
        positive label (a warning is logged otherwise).
    test_indices
        Iterable of group end offsets.

    Returns
    -------
    tuple
        ``(random_successful, model_successful)``: the summed ``1 / size``
        over all groups, and the number of groups whose highest-scored row
        has a positive label.
    """
    prob_all = model.predict(DMatrix(Xt, label=yt), output_margin=True)

    random_successful = 0.
    model_successful = 0.

    start = 0
    for end in test_indices:
        y = yt[start:end]
        if y.sum() != 1:
            logger.warning(f'Sum of y is not 1, {(y.sum(), start, end, y)}')

        # A uniformly random pick hits with probability 1 / group size.
        random_successful += 1 / (end - start)

        best_row = start + prob_all[start:end].argmax()
        if yt[best_row] > 0:
            model_successful += 1

        start = end

    return random_successful, model_successful
"like_pppprepost": [float(sys.argv[19])], "happiness": [float(sys.argv[20])], "love": [float(sys.argv[21])], "sadness": [float(sys.argv[22])], "travel": [float(sys.argv[23])], "food": [float(sys.argv[24])], "pet": [float(sys.argv[25])], "angry": [float(sys.argv[26])], "music": [float(sys.argv[27])], "party": [float(sys.argv[28])], "sport": [float(sys.argv[29])], "hashtags_pop_1": [0], "hashtags_pop_2": [0], "hashtags_pop_3": [0], "hashtags_pop_4": [0], "hashtags_pop_5": [0], "hashtags_pop_6": [0], "hashtags_pop_7": [0], "hashtags_pop_8": [0], "hashtags_pop_9": [0], "hashtags_pop_10": [0], "baseline": [float(sys.argv[30])] } df = pd.DataFrame.from_dict(input_item) model = Booster() model.load_model('Predictor/xgb_if_model_2020_v1.json') output = model.predict(xgboost.DMatrix(df)) print(output)
def run_slice(self, booster: xgb.Booster, dtrain: xgb.DMatrix,
              num_parallel_tree: int, num_classes: int,
              num_boost_round: int):
    """Check slicing of a booster: tree counts, invalid slices, predictions.

    Parameters
    ----------
    booster : xgb.Booster
        A trained booster with ``num_boost_round`` rounds.
    dtrain : xgb.DMatrix
        Data used to compare predictions of slices against the full model.
    num_parallel_tree : int
        Trees built per round and class; used to compute expected counts.
    num_classes : int
        Number of classes; used to compute expected tree counts.
    num_boost_round : int
        Number of boosting rounds the booster was trained with.
    """
    beg = 3
    end = 7

    # A plain [beg:end) slice keeps the feature types and contains
    # (end - beg) rounds worth of trees.
    sliced: xgb.Booster = booster[beg:end]
    assert sliced.feature_types == booster.feature_types

    sliced_trees = (end - beg) * num_parallel_tree * num_classes
    assert sliced_trees == len(sliced.get_dump())

    # A step of 2 keeps every other round, i.e. half of the trees.
    sliced_trees = sliced_trees // 2
    sliced = booster[beg:end:2]
    assert sliced_trees == len(sliced.get_dump())

    # An Ellipsis or an omitted bound both mean "up to the end" ...
    sliced = booster[beg:...]
    sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
    assert sliced_trees == len(sliced.get_dump())

    sliced = booster[beg:]
    sliced_trees = (num_boost_round - beg) * num_parallel_tree * num_classes
    assert sliced_trees == len(sliced.get_dump())

    # ... or "from the beginning".
    sliced = booster[:end]
    sliced_trees = end * num_parallel_tree * num_classes
    assert sliced_trees == len(sliced.get_dump())

    sliced = booster[...:end]
    sliced_trees = end * num_parallel_tree * num_classes
    assert sliced_trees == len(sliced.get_dump())

    # Negative begin is rejected.
    with pytest.raises(ValueError, match=r">= 0"):
        booster[-1:0]

    # we do not accept empty slice.
    with pytest.raises(ValueError):
        booster[1:1]
    # stop can not be smaller than begin
    with pytest.raises(ValueError, match=r"Invalid.*"):
        booster[3:0]
    with pytest.raises(ValueError, match=r"Invalid.*"):
        booster[3:-1]
    # negative step is not supported.
    with pytest.raises(ValueError, match=r".*>= 1.*"):
        booster[0:2:-1]
    # step can not be 0.
    with pytest.raises(ValueError, match=r".*>= 1.*"):
        booster[0:2:0]

    # Iterating the booster yields one item per boosting round.
    trees = [_ for _ in booster]
    assert len(trees) == num_boost_round

    with pytest.raises(TypeError):
        booster["wrong type"]
    with pytest.raises(IndexError):
        booster[:num_boost_round + 1]
    with pytest.raises(ValueError):
        booster[1, 2]  # too many dims
    # setitem is not implemented as model is immutable during slicing.
    with pytest.raises(TypeError):
        booster[...:end] = booster

    # Predicting with a slice must match predicting with the full model
    # restricted to the same iteration range.
    sliced_0 = booster[1:3]
    np.testing.assert_allclose(
        booster.predict(dtrain, iteration_range=(1, 3)), sliced_0.predict(dtrain))
    sliced_1 = booster[3:7]
    np.testing.assert_allclose(
        booster.predict(dtrain, iteration_range=(3, 7)), sliced_1.predict(dtrain))

    # Margins of two adjacent slices add up to the margin of the combined
    # slice once the doubly counted 0.5 is removed.
    predt_0 = sliced_0.predict(dtrain, output_margin=True)
    predt_1 = sliced_1.predict(dtrain, output_margin=True)

    merged = predt_0 + predt_1 - 0.5  # base score.
    single = booster[1:7].predict(dtrain, output_margin=True)
    np.testing.assert_allclose(merged, single, atol=1e-6)

    # The same holds for two interleaved strided slices that together cover
    # rounds 1 through 6.
    sliced_0 = booster[1:7:2]  # 1,3,5
    sliced_1 = booster[2:8:2]  # 2,4,6

    predt_0 = sliced_0.predict(dtrain, output_margin=True)
    predt_1 = sliced_1.predict(dtrain, output_margin=True)

    merged = predt_0 + predt_1 - 0.5
    single = booster[1:7].predict(dtrain, output_margin=True)
    np.testing.assert_allclose(merged, single, atol=1e-6)
# Read the source file twice: once as plain text lines, once as a token stream.
with open(file_path.strip()) as f:
    doc = f.read().splitlines()
with open(file_path.strip(), 'rb') as f:
    tokens = list(tokenize.tokenize(f.readline))

# Cursors into the token stream, the document and the completion table.
token_i = df_i = token_selection_start = index_i = 0
doc_pos = (0, 0)
length = len(tokens)
completions = list(df.c)

# Raw (margin) model scores for every feature row.
d_test = DMatrix(fe.X)
predicted_prob = model.predict(d_test, output_margin=True)

# Outcome counters.
correct_prediction = wrong_prediction = not_available = not_required = 0


def get_range_from_doc(start, end):
    """Return the text of ``doc`` between two positions, joined by newlines.

    ``start`` and ``end`` are ``(row, column)`` pairs with 1-based rows
    (presumably tokenize-style positions - confirm against the callers).
    The first line is cut from ``start``'s column, the line ``end[0]`` is
    cut at ``end``'s column, and lines in between are taken whole.
    """
    first_col = start[1]
    first_line = max(start[0] - 1, 0)
    last_line = end[0] - 1

    pieces = []
    for offset, line_no in enumerate(range(first_line, end[0])):
        begin_col = first_col if offset == 0 else 0
        end_col = end[1] if line_no == last_line else None
        pieces.append(doc[line_no][begin_col:end_col])
    return '\n'.join(pieces)