def get_results(X):
    # Encode categorical columns; `label`, `enc`, and `pca` are assumed to be
    # module-level transformers fitted at training time.
    categoricals = X.select_dtypes(include='object')
    categoricals = categoricals.astype(str)
    # Note: fit_transform refits the label encoder per column at prediction time.
    categoricals = categoricals.apply(label.fit_transform)
    # Keep 'country' as a plain label encoding; one-hot encode the rest.
    label_encoding = categoricals['country']
    categoricals.drop(['country'], axis=1, inplace=True)
    X_one = enc.transform(categoricals)
    encoded_data = pd.DataFrame(X_one.todense())
    encoded_data.reset_index(drop=True, inplace=True)
    categoricals.reset_index(drop=True, inplace=True)
    original_numeric = X.select_dtypes(include='number')
    original_numeric.reset_index(drop=True, inplace=True)
    X = pd.concat([original_numeric, encoded_data, label_encoding], axis=1).values
    Xp = pca.transform(X)
    # Rebuild a classifier around the booster saved on disk.
    clf = XGBClassifier()
    booster = Booster()
    booster.load_model('xgb.model')
    clf._Booster = booster
    classes = clf.predict_proba(Xp)
    y_pred = [0 if c[0] > 0.5 else 1 for c in classes]
    return y_pred
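# The function above leans on three module-level objects (`label`, `enc`, `pca`)
# that must already be fitted. A minimal sketch of how they could be prepared at
# training time, assuming standard scikit-learn transformers (the names and the
# X_train frame are illustrative, not from the original code):
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.decomposition import PCA

label = LabelEncoder()
enc = OneHotEncoder(handle_unknown='ignore')
pca = PCA(n_components=50)

train_cats = X_train.select_dtypes(include='object').astype(str)
train_cats = train_cats.apply(label.fit_transform)
enc.fit(train_cats.drop(['country'], axis=1))
# pca would then be fitted on the concatenated numeric + one-hot matrix.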
def predict(self, booster: xgb.Booster, **kwargs):
    """
    Run local XGBoost prediction.

    Parameters
    ----------
    booster : xgboost.Booster
        A trained booster.
    **kwargs : dict
        Other parameters for `xgboost.Booster.predict`.

    Returns
    -------
    tuple
        Pair of IP address of caller and pandas.DataFrame
        with partial prediction result.
    """
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = pandas.DataFrame(
        booster.predict(local_dpredict["dmatrix"], **kwargs),
        index=local_dpredict["index"],
    )
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return get_node_ip_address(), predictions
def load_saved_attributes():
    global model
    model = XGBRegressor()
    booster = Booster()
    booster.load_model('./ny_taxi_fare')
    model._Booster = booster
def analyze(self, event):
    array_list = [
        "lepJet_llpdnnx_-1_isLLP_QMU_QQMU",
        "lepJet_llpdnnx_0_isLLP_QMU_QQMU",
        "lepJet_llpdnnx_1_isLLP_QMU_QQMU",
        "lepJet_llpdnnx_2_isLLP_QMU_QQMU",
        "dimuon_mass", "dimuon_deltaR",
        "lepJet_pt", "lepJet_eta", "lepJet_deltaR",
        "MET_pt", "MET_phi",
        "looseMuons_pt", "looseMuons_eta", "looseMuons_dxy",
        "tightMuons_pt", "tightMuons_eta", "tightMuons_dxy",
    ]
    data = pd.DataFrame(data={
        "lepJet_llpdnnx_-1_isLLP_QMU_QQMU": getattr(event, "lepJet_llpdnnx_-1_isLLP_QMU_QQMU"),
        "lepJet_llpdnnx_0_isLLP_QMU_QQMU": event.lepJet_llpdnnx_0_isLLP_QMU_QQMU,
        "lepJet_llpdnnx_1_isLLP_QMU_QQMU": event.lepJet_llpdnnx_1_isLLP_QMU_QQMU,
        "lepJet_llpdnnx_2_isLLP_QMU_QQMU": event.lepJet_llpdnnx_2_isLLP_QMU_QQMU,
        "dimuon_mass": event.dimuon_mass,
        "dimuon_deltaR": event.dimuon_deltaR,
        "lepJet_pt": event.lepJet_pt,
        "lepJet_eta": event.lepJet_eta,
        "lepJet_deltaR": event.lepJet_deltaR,
        "MET_pt": event.MET_pt,
        "MET_phi": event.MET_phi,
        "looseMuons_pt": event.looseMuons_pt,
        "looseMuons_eta": event.looseMuons_eta,
        "looseMuons_dxy": event.looseMuons_dxy,
        "tightMuons_pt": event.tightMuons_pt,
        "tightMuons_eta": event.tightMuons_eta,
        "tightMuons_dxy": event.tightMuons_dxy,
    }, columns=array_list, index=[0])
    model = XGBClassifier()
    booster = Booster()
    #model._le = LabelEncoder().fit([1])
    booster.load_model(self.modelPath)
    booster.feature_names = array_list
    model._Booster = booster
    bdt_score = model.predict_proba(data)
    setattr(event, "bdt_score", bdt_score[:, 1])
    return True
def get_model(cls, algorithm_name: str, model_path: str):
    if algorithm_name == 'xgboost':
        model = xgb.XGBClassifier()
        booster = Booster()
        booster.load_model(model_path)
        model._Booster = booster
    else:
        model = joblib.load(model_path)
    return model
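# A hypothetical call to get_model, assuming it is exposed as a classmethod on a
# factory class (the ModelFactory name and file path below are illustrative):
model = ModelFactory.get_model('xgboost', 'models/classifier.xgb')
probabilities = model.predict_proba(X_test)  # sklearn API backed by the injected booster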
def test(x, modelFile):
    model = Booster()  # init model
    model.load_model(modelFile)  # load the trained model
    maps = np.load("Map.npy", allow_pickle=True)
    x_enc = encode([x])
    y_enc = model.predict(DMatrix(x_enc))
    y_pred = np.argmax(y_enc)
    # Map the predicted class index back to its original label.
    inverseMap = maps.item().get("inverseMap")
    y_hat = inverseMap[y_pred]
    print(y_hat)
def predict(self, booster: xgb.Booster, *args, **kwargs):
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = [
        booster.predict(X, *args, **kwargs) for X in local_dpredict
    ]
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return np.concatenate(predictions)
def train_xgb(X, y, params, save_path=None, save_path_booster=None):
    # The threshold is not handled by the XGB interface.
    params, binary_threshold = _parse_param_and_delete(params, 'binary_threshold', .5)

    # n_jobs is handled by the XGB sklearn interface.
    params = _parse_param_and_keep(params, name='n_jobs',
                                   default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()
    if not tuple(np.sort(np.unique(y))) == (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False)
    model = clone(model)
    model.set_params(**params)
    logging.info('Training...')
    model.fit(
        X, y,
        # early_stopping_rounds=10,
        verbose=True,
    )

    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    model.get_booster().save_model(temp_file)
    booster = Booster(model_file=temp_file)
    os.remove(temp_file)

    if binary_threshold == 'auto':
        # Pick the cutoff so the predicted positive rate matches the class balance.
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]
        logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster, features=X.shape[1],
                                    threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model
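# Toy illustration of the binary_threshold='auto' rule in train_xgb: the scores
# are sorted ascending and the value at index (y == 0).sum() becomes the cutoff,
# so the predicted positive rate matches the training class balance. The numbers
# here are made up for the example:
import numpy as np

y = np.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1])            # 6 negatives, 4 positives
p_ = np.sort(np.array([.05, .1, .2, .3, .35, .4, .55, .6, .7, .9]))
threshold = p_[int((y == 0).sum())]                      # p_[6] == 0.55
# Scores >= 0.55 are then labeled positive: exactly 4 of 10, matching y.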
def predict(self, booster: xgb.Booster, **kwargs):
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = [
        pandas.DataFrame(booster.predict(X, **kwargs)) for X in local_dpredict
    ]
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return predictions if len(predictions) > 1 else predictions[0]
def create_predictor_infos():
    word_index = {}
    n_tokens = 0
    with open('../data/output/word_frequency.pkl', 'rb') as f:
        word_frequency = cPickle.load(f)
    assert type(word_frequency) == dict
    for k, v in sorted(word_frequency.items(), key=lambda x: x[1]):
        if v > THRESHOLD_FREQ:
            word_index[k] = n_tokens
            n_tokens += 1
    bst = Booster()
    bst.load_model('../data/model/xgboost_model.model')
    return word_index, n_tokens, bst
def predict(self, booster: xgb.Booster, **kwargs):
    local_dpredict = self._dpredict
    booster.set_param({"nthread": self._nthreads})
    s = time.time()
    predictions = pandas.DataFrame(
        booster.predict(local_dpredict["dmatrix"], **kwargs),
        index=local_dpredict["index"],
    )
    LOGGER.info(f"Local prediction time: {time.time() - s} s")
    return get_node_ip_address(), predictions
def test_it_can_override_an_existing_one(self) -> None:
    station_id = uuid.uuid4()
    first = StationAvailabilityAlgorithm(
        station_id, DataFrame(['data_1', 'frame_1', 'test_1']), Booster())
    self._repository.save(first)
    override = StationAvailabilityAlgorithm(
        station_id, DataFrame(['data_2', 'frame_2', 'test_2']), Booster())
    self._repository.save(override)
    self.assertEqual(self._repository.find_by_station_id(station_id), override)
def xgb_latest() -> Tuple[Booster, Dict[str, pandas.Categorical]]:
    # Note: a tuple return type must be written Tuple[...] (requires
    # `from typing import Tuple`); a bare (Booster, Dict) annotation is not valid typing.
    base_path = '/var/opt/pcsml/devel/training_data/dumps/debug004/2017-12-27T18-30-59'
    model = Booster()
    model.load_model(os.path.join(base_path, 'model_2017-12-27T18-30-59.xgb'))
    with gzip.open(
            os.path.join(base_path,
                         'model_2017-12-27T18-30-59_column_categories.pickle.gz'),
            'rb') as f:
        column_categories = pickle.load(f)
    return model, column_categories
def select_stocks(context, data):
    #clf = pickle.load(BytesIO(read_file('xgb_factors_model_ZZ800_D.model')))
    #file = read_file('xgb_factors_model_ZZ800_D.model')
    #clf = Booster.load_model(fname = BytesIO(read_file('xgb_factors_model_ZZ800_D.model')))
    with open('temp', 'wb') as f:
        f.write(read_file('xgb_factors.model'))  # write a temporary file; clean it up when the process ends
    clf = Booster(model_file='temp')
    #clf = Booster.load_model(fname = 'temp')
    industry_old_code = ['801010', '801020', '801030', '801040', '801050', '801080',
                         '801110', '801120', '801130', '801140', '801150', '801160',
                         '801170', '801180', '801200', '801210', '801230']
    industry_new_code = ['801010', '801020', '801030', '801040', '801050', '801080',
                         '801110', '801120', '801130', '801140', '801150', '801160',
                         '801170', '801180', '801200', '801210', '801230', '801710',
                         '801720', '801730', '801740', '801750', '801760', '801770',
                         '801780', '801790', '801880', '801890']
    starttime = datetime.datetime.now()
    date = context.previous_date  # fetch industry factor data as of this date
    print('Data retrieval date:', date)
    '''
    if datetime.datetime.strptime(date,"%Y-%m-%d").date()<datetime.date(2014,2,21):
        industry_code=industry_old_code
    else:
    '''
    industry_code = industry_new_code
    stockList = get_stock('ZZ800', date)
    factor_origl_data = get_factor_data(stockList, date)
    factor_solve_data = data_preprocessing(factor_origl_data, stockList,
                                           industry_code, date)
    endtime = datetime.datetime.now()
    print('Data fetch runtime:', int((endtime - starttime).seconds / 60), 'minutes')

    test_feature_or = factor_solve_data.copy()
    test_feature = np.array(test_feature_or)

    # Model prediction
    test_predict = clf.predict(DMatrix(test_feature_or))
    test_sample_predict = pd.DataFrame(
        data=test_predict,
        index=test_feature_or.index,
        columns=['XGB_predict_0', 'XGB_predict_1', 'XGB_predict_2',
                 'XGB_predict_3', 'XGB_predict_4', 'XGB_predict_5',
                 'XGB_predict_6', 'XGB_predict_7', 'XGB_predict_8',
                 'XGB_predict_9', 'XGB_predict_10', 'XGB_predict_11'])
    #test_sample_predict['XGB_predict_0_and_1'] = test_sample_predict['XGB_predict_0'] + test_sample_predict['XGB_predict_1']
    test_sample_predict = test_sample_predict.sort_values(by='XGB_predict_0',
                                                          ascending=False)
    stock_list = test_sample_predict.index.values.tolist()
    stock_list = stock_list[:g.buy_stock_count]
    return stock_list
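# A tidier variant of the temp-file workaround above. Booster needs a real file
# path rather than a BytesIO, but the standard tempfile module avoids leaving a
# stray 'temp' file behind (read_file as in the original; this is a sketch, not
# the original author's code):
import os
import tempfile

with tempfile.NamedTemporaryFile(suffix='.model', delete=False) as tmp:
    tmp.write(read_file('xgb_factors.model'))
    tmp_path = tmp.name
try:
    clf = Booster(model_file=tmp_path)
finally:
    os.remove(tmp_path)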
def upload_xgb_to_memsql(xgb: Booster,
                         conn: Connection,
                         udf_name: str,
                         func=F.SIGMOID,
                         feature_names: List[str] = None,
                         allow_overwrite: bool = False) -> None:
    if feature_names:
        xgb.feature_names = feature_names
    trees = split_trees(xgb.trees_to_dataframe())
    sqls = [tree_to_func_def(udf_name, allow_overwrite, t) for t in trees]
    sqls.append(
        tree_to_main_func(udf_name, allow_overwrite, trees, xgb.feature_names, func))
    for s in sqls:
        assert 1 == conn.query(s)
def test_model(self, model: xgb.XGBClassifier, df: pd.DataFrame) -> Tuple[str, float]:
    num_cols = [
        "shipping_free", "price", "accepts_mercadopago", "automatic_relist",
        "initial_quantity", "sold_quantity", "available_quantity", "quantity",
    ]
    # Note: ("target",) needs the trailing comma; a bare ("target") is just a
    # string, so `c not in ("target")` would be a substring test.
    cat_cols = [
        c for c in df.columns if c not in num_cols and c not in ("target",)
    ]
    df = self.feature_engineer(df, cat_cols)
    features = [f for f in df.columns if f not in ("target",)]
    for col in features:
        if col not in num_cols:
            lbl = LabelEncoder()
            lbl.fit(df[col])
            df.loc[:, col] = lbl.transform(df[col])
    X = df.drop(["target"], axis=1).values
    y = df.target.values
    # predict_proba is the sklearn-wrapper API, so the model must be an
    # XGBClassifier rather than a raw Booster.
    preds = model.predict_proba(X)[:, 1]
    auc = metrics.roc_auc_score(y, preds)
    logger.info(f"AUC Test = {auc}")
    return "auc", auc
def load_or_create(objective='multi:softprob', max_depth=2, seed=4242,
                   eval_metric='merror', num_class=4520, num_feature=256,
                   **kwargs):
    # `from_scratch` is assumed to be a module-level flag.
    if from_scratch:
        print_info('Creating XGB Boosted Tree')
        params = {
            'updater': 'grow_gpu',
            'predictor': 'gpu_predictor',
            'tree_method': 'gpu_hist',
            'eval_metric': eval_metric,
            'objective': objective,
            'num_class': num_class,
            'max_depth': max_depth,
            'seed': seed,
            'num_feature': num_feature,
        }
        params = {**params, **kwargs}
        model = Booster(params)
    else:
        model = load_model()
    return model
def _run_xgboost(model: xgb.Booster, data: pd.DataFrame) -> pd.DataFrame:
    """Retrieve the win probability.

    Parameters
    ----------
    model : xgb.Booster
        The fitted XGBoost model.
    data : pd.DataFrame
        The input dataset to be evaluated.

    Returns
    -------
    pd.DataFrame
        The updated dataset.
    """
    # First, get the partial hazard values
    hazard = model.predict(xgb.DMatrix(data[META["static"] + META["dynamic"]]))
    # Get the cumulative probability; `cumulative_hazard_` is assumed to be
    # attached to the booster by the training code.
    c0 = interpolate_at_times(model.cumulative_hazard_, data["stop"].values)
    new = data.copy()
    new[META["survival"]] = 1 - np.exp(-(c0 * hazard))
    return new
def xgbooster_predict_proba(booster: xgb.Booster, d_x: xgb.DMatrix) -> np.ndarray:
    """ Simulate the `predict_proba` interface from sklearn

    This function will only work as expected if `booster` has been trained
    using the `binary:logistic` loss.

    Parameters
    ----------
    booster : xgboost.Booster
        The trained booster

    d_x : xgboost.DMatrix
        The dataset

    Returns
    -------
    y_proba_pred : numpy.ndarray
        The probabilistic predictions. The shape of the array is (n_row, 2).
    """
    y_score = booster.predict(d_x)
    y_false = 1 - y_score
    size = (d_x.num_row(), 2)
    y_proba_pred = np.zeros(size)
    y_proba_pred[:, 0] = y_false
    y_proba_pred[:, 1] = y_score
    return y_proba_pred
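# A minimal usage sketch for xgbooster_predict_proba; the model file name and
# the random data are illustrative:
import numpy as np
import xgboost as xgb

booster = xgb.Booster()
booster.load_model('binary_logistic.model')  # must have been trained with binary:logistic
d_test = xgb.DMatrix(np.random.rand(5, 10))
proba = xgbooster_predict_proba(booster, d_test)
assert proba.shape == (5, 2)
assert np.allclose(proba.sum(axis=1), 1.0)  # columns are P(y=0) and P(y=1)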
def load_xgb_model(fname):
    """
    Load an XGBoost model that was saved as a file with the
    HyperXGBClassifier.save method. The model spans two files:

    * The first file contains the model saved with the Booster class;
      this file has no extension.
    * The second file contains the parameters used to create the model;
      this file has the extension '.p'.

    Parameters
    ----------
    fname : path
        The file name without extension.
    """
    from xgboost import Booster
    params = pickle.load(open(fname + '.p', "rb"))
    n_classes = params['meta']['n_classes']
    param_map = params['param_map']
    model = HyperXGBClassifier(**param_map)
    model.set_n_labels(n_classes - 1)
    y = [i for i in range(n_classes)]
    model.set_le(y)
    model._Booster = Booster(model_file=fname)
    return model
def get_feature_importances_from_booster(cls, booster: Booster) -> np.ndarray:
    """Gets feature importances from an XGB booster.

    This is based on the feature_importance_ property defined in:
    https://github.com/dmlc/xgboost/blob/master/python-package/xgboost/sklearn.py

    Args:
        booster(Booster): Booster object, most of the time the median model
            (quantile=0.5) is preferred

    Returns:
        (np.ndarray) with normalized feature importances
    """
    # Get score
    score = booster.get_score(importance_type="gain")
    # Get feature names from booster
    feature_names = booster.feature_names
    # Get importance
    feature_importance = [score.get(f, 0.0) for f in feature_names]
    # Convert to array
    features_importance_array = np.array(feature_importance, dtype=np.float32)
    total = features_importance_array.sum()  # For normalizing
    if total == 0:
        return features_importance_array
    return features_importance_array / total  # Normalize
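# Illustrative use: rank features by normalized gain. The FeatureImportanceMixin
# class name and model path are hypothetical stand-ins for wherever this
# classmethod actually lives:
booster = Booster(model_file='median_model.xgb')
importances = FeatureImportanceMixin.get_feature_importances_from_booster(booster)
top10 = sorted(zip(booster.feature_names, importances),
               key=lambda pair: pair[1], reverse=True)[:10]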
def test_it_can_not_find_one_by_station_id_if_it_does_not_exist(self) -> None:
    station_availability_algorithm = StationAvailabilityAlgorithm(
        uuid.uuid4(), DataFrame(['data', 'frame', 'test']), Booster())
    self._repository.save(station_availability_algorithm)
    self.assertIsNone(self._repository.find_by_station_id(uuid.uuid4()))
def after_iteration(
    self, model: xgb.Booster, epoch: int,
    evals_log: xgb.callback.TrainingCallback.EvalsLog
):
    # dmat, y_lower, y_upper, X and acc_rec come from the enclosing scope.
    y_pred = model.predict(dmat)
    # Fraction of predictions falling inside the [y_lower, y_upper] interval.
    acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)) / len(X)
    acc_rec.append(acc)
    return False  # returning False lets training continue
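# Sketch of how a callback like the one above is attached to training. The
# after_iteration method would sit on a subclass of
# xgb.callback.TrainingCallback; the IntervalAccuracyCallback name, params and
# dtrain below are illustrative:
acc_rec = []
bst = xgb.train(params, dtrain, num_boost_round=100,
                callbacks=[IntervalAccuracyCallback()])
# acc_rec now holds one interval-coverage value per boosting round.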
def predict(self, model: xgb.Booster, data: RayDMatrix, **kwargs):
    _set_omp_num_threads()
    if data not in self._data:
        self.load_data(data)
    local_data = self._data[data]
    predictions = model.predict(local_data, **kwargs)
    return predictions
def merge_labeled_weight_importance(model: Booster,
                                    label_encoder: OneHotLabelEncoder) -> Dict:
    f_imp = model.get_score(importance_type='weight')
    merged: Dict[str, int] = {}
    for f in f_imp:
        src_feature = label_encoder.source_column(f)
        merged[src_feature] = merged.get(src_feature, 0) + f_imp[f]
    return merged
def test_it_can_find_one_by_station_id(self) -> None:
    station_id = uuid.uuid4()
    station_availability_algorithm = StationAvailabilityAlgorithm(
        station_id, DataFrame(['data', 'frame', 'test']), Booster())
    self._repository.save(station_availability_algorithm)
    self.assertEqual(self._repository.find_by_station_id(station_id),
                     station_availability_algorithm)
def merge_labeled_weight_importance(
        model: Booster,
        dummy_col_sep=categorical_util.DUMMY_COL_SEP) -> Dict[str, int]:
    f_imp = model.get_score(importance_type='weight')
    merged: Dict[str, int] = {}
    for f in f_imp:
        src_feature = categorical_util.get_source_name_from_dummy(f, dummy_col_sep)
        merged[src_feature] = merged.get(src_feature, 0) + f_imp[f]
    return merged
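# Toy illustration of the merge, assuming DUMMY_COL_SEP is '__' and
# get_source_name_from_dummy strips everything after the separator: two dummy
# columns derived from 'color' fold back into a single source-feature count.
f_imp = {'color__red': 3, 'color__blue': 2, 'age': 7}
# merge_labeled_weight_importance would then produce:
# {'color': 5, 'age': 7}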
def load_saved_attributes():
    global host_response_time_values
    global neighbourhood_values
    global property_type_values
    global room_type_values
    global cancellation_policy_values
    global model
    with open("columns.json", "r") as f:
        resp = json.load(f)
    host_response_time_values = resp["host_response_time"]
    neighbourhood_values = resp["neighbourhood"]
    property_type_values = resp["property_type"]
    room_type_values = resp["room_type"]
    cancellation_policy_values = resp["cancellation_policy"]
    model = XGBRegressor()
    booster = Booster()
    booster.load_model('airbnb_price_predictor')
    model._Booster = booster
def load(filename):
    '''
    Loads in an xgboost model from the given file location

    Parameters
    ----------
    filename : string
        path of model file to be loaded

    Returns
    -------
    booster : xgboost.Booster()
        model that is loaded
    metadata : dict
        parameter metadata for model in the form of json data.
        Use get_params() function to use in model prediction.
    '''
    booster = Booster({'nthread': 4})
    # Check if model file exists as it has been written by the user.
    # If not, add model_ to filename as designated in save()
    if not path.exists(filename):
        model_file = filename.split('/')
        model_file[-1] = 'model_' + model_file[-1]
        model_file = '/'.join(model_file)
    else:
        model_file = filename
    config_file = model_file.replace('model_', 'config_')
    booster.load_model(model_file)
    with open(config_file, 'r', encoding='utf-8') as f:
        config = f.read()
    # Parse the JSON once; calling json.loads again on the resulting dict
    # would raise a TypeError.
    metadata = json.loads(config)
    return booster, metadata
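# Hypothetical round trip with the save()/load() pair described in the
# docstring above (paths and data are illustrative):
booster, metadata = load('models/my_model')  # resolves model_my_model + config_my_model
preds = booster.predict(DMatrix(X_new))
print(metadata)  # the parameter metadata saved alongside the model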
def from_model(
    cls,
    booster: xgboost.Booster,
    *,
    path: os.PathLike,
    preprocessor: Optional["Preprocessor"] = None,
) -> "XGBoostCheckpoint":
    """Create a :py:class:`~ray.air.checkpoint.Checkpoint` that stores an XGBoost model.

    Args:
        booster: The XGBoost model to store in the checkpoint.
        path: The directory where the checkpoint will be stored.
        preprocessor: A fitted preprocessor to be applied before inference.

    Returns:
        An :py:class:`XGBoostCheckpoint` containing the specified ``Estimator``.

    Examples:
        >>> from ray.train.xgboost import XGBoostCheckpoint
        >>> import xgboost
        >>>
        >>> booster = xgboost.Booster()
        >>> checkpoint = XGBoostCheckpoint.from_model(booster, path=".")  # doctest: +SKIP # noqa: E501

        You can use a :py:class:`XGBoostCheckpoint` to create an
        :py:class:`~ray.train.xgboost.XGBoostPredictor` and perform inference.

        >>> from ray.train.xgboost import XGBoostPredictor
        >>>
        >>> predictor = XGBoostPredictor.from_checkpoint(checkpoint)  # doctest: +SKIP # noqa: E501
    """
    booster.save_model(os.path.join(path, MODEL_KEY))
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)
    checkpoint = cls.from_directory(path)
    return checkpoint
def to_air_checkpoint(
    path: str,
    booster: xgboost.Booster,
    preprocessor: Optional["Preprocessor"] = None,
) -> Checkpoint:
    """Convert a pretrained model to an AIR checkpoint for serve or inference.

    Args:
        path: The directory path where model and preprocessor steps are stored to.
        booster: A pretrained xgboost model.
        preprocessor: A fitted preprocessor. The preprocessing logic will
            be applied to serve/inference.

    Returns:
        A Ray AIR checkpoint.
    """
    booster.save_model(os.path.join(path, MODEL_KEY))
    if preprocessor:
        save_preprocessor_to_dir(preprocessor, path)
    checkpoint = Checkpoint.from_directory(path)
    return checkpoint
def my_train_xgboost(params, dtrain, num_boost_round=10, evals=(), obj=None,
                     feval=None, early_stopping_rounds=None, seed=0,
                     rt_eta=1.0006, rt_ssp=1.0006, rt_clb=1.0006, rt_dpt=1.0001):
    """
    Train a booster with given parameters.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained.
    num_boost_round : int
        Number of boosting iterations.
    evals : list of pairs (DMatrix, string)
        List of items to be evaluated during training, this allows user to
        watch performance on the validation set.
    obj : function
        Customized objective function.
    feval : function
        Customized evaluation function.
    early_stopping_rounds : int
        Activates early stopping. Validation error needs to decrease at least
        every <early_stopping_rounds> round(s) to continue training.
        Requires at least one item in evals. If there's more than one, will
        use the last. Returns the model from the last iteration (not the best
        one). If early stopping occurs, the model will have two additional
        fields: bst.best_score and bst.best_iteration.

    Returns
    -------
    booster : a trained booster model
    """
    eta = params['eta']
    ssp = params['subsample']
    clb = params['colsample_bytree']
    # rt_eta = np.random.random()
    rt_ssp = np.random.uniform(0.1, 0.9)
    rt_clb = np.random.uniform(0.1, 0.9)
    evals = list(evals)
    bst = Booster(params, [dtrain] + [d[0] for d in evals])
    # Booster() takes no `seed` keyword, so set it as a training parameter.
    bst.set_param({'seed': seed})

    if not early_stopping_rounds:
        for i in range(num_boost_round):
            bst.set_param({'eta': eta})
            bst.set_param({'subsample': ssp})
            bst.set_param({'colsample_bytree': clb})
            eta = eta * rt_eta
            # ssp = ssp * rt_ssp
            # clb = clb * rt_clb
            ssp = rt_ssp
            clb = rt_clb
            bst.update(dtrain, i, obj)
            if len(evals) != 0:
                bst_eval_set = bst.eval_set(evals, i, feval)
                if isinstance(bst_eval_set, string_types):
                    sys.stderr.write(bst_eval_set + '\n')
                else:
                    sys.stderr.write(bst_eval_set.decode() + '\n')
        return bst
    else:
        # early stopping
        if len(evals) < 1:
            raise ValueError('For early stopping you need at least one set in evals.')
        sys.stderr.write("Will train until {} error hasn't decreased in {} rounds.\n"
                         .format(evals[-1][1], early_stopping_rounds))

        # is params a list of tuples? are we using multiple eval metrics?
        if type(params) == list:
            if len(params) != len(dict(params).items()):
                raise ValueError('Check your params. '
                                 'Early stopping works with single eval metric only.')
            params = dict(params)

        # either minimize loss or maximize AUC/MAP/NDCG
        maximize_score = False
        if 'eval_metric' in params:
            maximize_metrics = ('auc', 'map', 'ndcg')
            # Note: a bare filter(...) object is always truthy in Python 3,
            # so use any() for the actual membership test.
            if any(params['eval_metric'].startswith(x) for x in maximize_metrics):
                maximize_score = True
        if maximize_score:
            best_score = 0.0
        else:
            best_score = float('inf')

        best_msg = ''
        best_score_i = 0

        for i in range(num_boost_round):
            bst.set_param({'eta': eta})
            bst.set_param({'subsample': ssp})
            bst.set_param({'colsample_bytree': clb})
            eta = eta * rt_eta
            # ssp = ssp * rt_ssp
            # clb = clb * rt_clb
            ssp = rt_ssp
            clb = rt_clb
            bst.update(dtrain, i, obj)
            bst_eval_set = bst.eval_set(evals, i, feval)

            if isinstance(bst_eval_set, string_types):
                msg = bst_eval_set
            else:
                msg = bst_eval_set.decode()

            sys.stderr.write(msg + '\n')

            score = float(msg.rsplit(':', 1)[1])
            if (maximize_score and score > best_score) or \
                    (not maximize_score and score < best_score):
                best_score = score
                best_score_i = i
                best_msg = msg
            elif i - best_score_i >= early_stopping_rounds:
                sys.stderr.write("Stopping. Best iteration:\n{}\n\n".format(best_msg))
                bst.best_score = best_score
                bst.best_iteration = best_score_i
                return bst
        return bst
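# A sketch of calling my_train_xgboost with early stopping; the data and
# parameter values are illustrative:
dtrain = DMatrix(X_tr, label=y_tr)
dval = DMatrix(X_va, label=y_va)
params = {'eta': 0.3, 'subsample': 0.8, 'colsample_bytree': 0.8,
          'objective': 'binary:logistic', 'eval_metric': 'auc'}
bst = my_train_xgboost(params, dtrain, num_boost_round=200,
                       evals=[(dval, 'validation')], early_stopping_rounds=10)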