Example #1
def xgbooster_predict_proba(booster: xgb.Booster,
                            d_x: xgb.DMatrix) -> np.ndarray:
    """ Simulate the `predict_proba` interface from sklearn
    
    This function will only work as expected if `booster` has been
    training using the `binary:logistic` loss.
    
    Parameters
    ----------
    booster : xgboost.Booster
        The trained booster
        
    d_x : xgboost.DMatrix
        The dataset
        
    Returns
    -------
    y_probas_pred : numpy.ndarray
        The probabilistic predictions. The shape of the array
        is (n_rows, 2).
    """
    y_score = booster.predict(d_x)
    y_false = 1 - y_score
    size = (d_x.num_row(), 2)

    y_probas_pred = np.zeros(size)
    y_probas_pred[:, 0] = y_false
    y_probas_pred[:, 1] = y_score

    return y_probas_pred
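A minimal usage sketch for the helper above (synthetic data; the booster must be trained with `binary:logistic` for the two-column output to be meaningful):

import numpy as np
import xgboost as xgb

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
y = (X[:, 0] > 0).astype(int)

d_train = xgb.DMatrix(X, label=y)
booster = xgb.train({"objective": "binary:logistic"}, d_train, num_boost_round=10)

y_probas = xgbooster_predict_proba(booster, d_train)
assert y_probas.shape == (100, 2)
assert np.allclose(y_probas.sum(axis=1), 1.0)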
Example #2
    def _run_xgboost(model: xgb.Booster, data: pd.DataFrame) -> pd.DataFrame:
        """Retrieve the win probability.

        Parameters
        ----------
        model : xgb.Booster
            The fitted XGBoost model.
        data : pd.DataFrame
            The input dataset to be evaluated.

        Returns
        -------
        pd.DataFrame
            The updated dataset, with the survival probability column added.
        """
        # First, get the partial hazard values
        hazard = model.predict(
            xgb.DMatrix(data[META["static"] + META["dynamic"]]))
        # Get the cumulative probability
        c0 = interpolate_at_times(model.cumulative_hazard_,
                                  data["stop"].values)
        new = data.copy()
        new[META["survival"]] = 1 - np.exp(-(c0 * hazard))

        return new
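`META` and `interpolate_at_times` come from the surrounding project. A plausible sketch of the interpolation helper, stated as an assumption rather than the project's actual code:

import numpy as np
import pandas as pd

def interpolate_at_times(cumulative_hazard: pd.DataFrame,
                         times: np.ndarray) -> np.ndarray:
    # Assumed layout: the index holds the unique event times and the
    # single column holds the baseline cumulative hazard H0(t).
    event_times = cumulative_hazard.index.values
    values = cumulative_hazard.iloc[:, 0].values
    # Linear interpolation, clamped at both ends.
    return np.interp(times, event_times, values)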
Example #3
    def predict(self, booster: xgb.Booster, **kwargs):
        """
        Run local XGBoost prediction.

        Parameters
        ----------
        booster : xgboost.Booster
            A trained booster.
        **kwargs : dict
            Other parameters for `xgboost.Booster.predict`.

        Returns
        -------
        tuple
            Pair of IP address of caller and pandas.DataFrame
            with partial prediction result.
        """
        local_dpredict = self._dpredict
        booster.set_param({"nthread": self._nthreads})

        s = time.time()

        predictions = pandas.DataFrame(
            booster.predict(local_dpredict["dmatrix"], **kwargs),
            index=local_dpredict["index"],
        )
        LOGGER.info(f"Local prediction time: {time.time() - s} s")

        return get_node_ip_address(), predictions
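On the driver side, the `(ip, frame)` pairs returned by such actor methods are typically reassembled by row index; a minimal sketch (an assumption, not the library's actual collection code):

import pandas

def combine_partial_predictions(results):
    # results: list of (ip_address, pandas.DataFrame) pairs from the actors.
    frames = [frame for _ip, frame in results]
    return pandas.concat(frames).sort_index()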
Example #4
    def after_iteration(
        self, model: xgb.Booster,
        epoch: int,
        evals_log: xgb.callback.TrainingCallback.EvalsLog
    ):
        # dmat, y_lower, y_upper, X and acc_rec are captured from the
        # enclosing scope in the original snippet.
        y_pred = model.predict(dmat)
        acc = np.sum(np.logical_and(y_pred >= y_lower, y_pred <= y_upper)) / len(X)
        acc_rec.append(acc)
        return False
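The snippet is the body of a custom training callback; a self-contained sketch of how it might be wired up (the interval-accuracy metric and the constructor arguments are assumptions inferred from the snippet, not the original project's code):

import numpy as np
import xgboost as xgb

class IntervalAccuracy(xgb.callback.TrainingCallback):
    # Records, per boosting round, the fraction of predictions that fall
    # inside a per-row [y_lower, y_upper] interval.
    def __init__(self, dmat, y_lower, y_upper):
        super().__init__()
        self.dmat = dmat
        self.y_lower = y_lower
        self.y_upper = y_upper
        self.acc_rec = []

    def after_iteration(self, model, epoch, evals_log):
        y_pred = model.predict(self.dmat)
        inside = np.logical_and(y_pred >= self.y_lower, y_pred <= self.y_upper)
        self.acc_rec.append(np.sum(inside) / len(y_pred))
        return False  # returning False keeps training going

The instance would then be passed to `xgb.train(..., callbacks=[IntervalAccuracy(dmat, y_lower, y_upper)])`.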
Example #5
    def predict(self, model: xgb.Booster, data: RayDMatrix, **kwargs):
        _set_omp_num_threads()

        if data not in self._data:
            self.load_data(data)
        local_data = self._data[data]

        predictions = model.predict(local_data, **kwargs)
        return predictions
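This method is a per-actor step inside xgboost_ray; user code would normally go through the library's top-level API instead. A hedged sketch of that path (assumes xgboost_ray is installed and `booster` is an already-trained `xgb.Booster`):

import numpy as np
from xgboost_ray import RayDMatrix, RayParams, predict

X = np.random.rand(64, 4)  # illustrative feature matrix
ray_data = RayDMatrix(X)
preds = predict(booster, ray_data, ray_params=RayParams(num_actors=2))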
Example #6
def test(x, modelFile):
    model = Booster()  # initialise an empty booster
    model.load_model(modelFile)  # load the trained model from disk
    maps = np.load("Map.npy", allow_pickle=True)
    x_enc = encode([x])  # `encode` is a project-specific featurizer
    y_enc = model.predict(DMatrix(x_enc))
    y_pred = np.argmax(y_enc)  # index of the most probable class
    inverseMap = maps.item().get("inverseMap")
    y_hat = inverseMap[y_pred]  # map the class index back to its label
    print(y_hat)
Example #7
    def predict(self, booster: xgb.Booster, *args, **kwargs):
        local_dpredict = self._dpredict
        booster.set_param({"nthread": self._nthreads})

        s = time.time()
        predictions = [
            booster.predict(X, *args, **kwargs) for X in local_dpredict
        ]
        LOGGER.info(f"Local prediction time: {time.time() - s} s")
        return np.concatenate(predictions)
Example #8
    def predict(self, booster: xgb.Booster, **kwargs):
        local_dpredict = self._dpredict
        booster.set_param({"nthread": self._nthreads})

        s = time.time()
        predictions = [
            pandas.DataFrame(booster.predict(X, **kwargs))
            for X in local_dpredict
        ]
        LOGGER.info(f"Local prediction time: {time.time() - s} s")
        return predictions if len(predictions) > 1 else predictions[0]
Example #9
def train_xgb(X, y, params, save_path=None, save_path_booster=None):

    # the binary threshold is not handled by the XGB interface
    params, binary_threshold = _parse_param_and_delete(params,
                                                       'binary_threshold', .5)

    # n_jobs is handled by the XGB sklearn interface
    params = _parse_param_and_keep(params,
                                   name='n_jobs',
                                   default=min(max_cpu_count(), 24))

    X = np.asarray(X)
    y = np.asarray(y).flatten()

    if tuple(np.sort(np.unique(y))) != (0, 1):
        raise NotImplementedError(
            'XGB Wrapper currently only supports binary classification.')

    # Fit the model
    model = XGBClassifier(use_label_encoder=False)
    model = clone(model)
    model.set_params(**params)

    logging.info('Training...')
    model.fit(
        X,
        y,
        # early_stopping_rounds=10,
        verbose=True,
    )
    # Save and re-load (feature-agnostic model)
    temp_file = f'temp-{time.time()}-{random.random()}.bin'
    model.get_booster().save_model(temp_file)
    booster = Booster(model_file=temp_file)
    os.remove(temp_file)

    if binary_threshold == 'auto':
        p_ = booster.predict(DMatrix(X))
        p_ = np.sort(p_)
        binary_threshold = p_[int((y == 0).sum())]

    logging.info(f'Using a binary_threshold = {binary_threshold}')

    # Wrap
    model = XGBClassifierSKLWrapper(booster,
                                    features=X.shape[1],
                                    threshold=binary_threshold)

    # Save
    if save_path is not None:
        save_pickle(model, save_path)
    if save_path_booster is not None:
        save_pickle(model.get_booster(), save_path_booster)
    return model
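A minimal call sketch (synthetic data; `XGBClassifierSKLWrapper` and the parameter helpers come from the surrounding project, and the wrapper is assumed to expose the usual sklearn-style `predict`):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + rng.normal(scale=0.5, size=200) > 0).astype(int)

model = train_xgb(X, y, params={'n_estimators': 50, 'binary_threshold': 'auto'})
y_pred = model.predict(X)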
Example #10
    def predict(self, booster: xgb.Booster, **kwargs):
        local_dpredict = self._dpredict
        booster.set_param({"nthread": self._nthreads})

        s = time.time()

        predictions = pandas.DataFrame(
            booster.predict(local_dpredict["dmatrix"], **kwargs),
            index=local_dpredict["index"],
        )
        LOGGER.info(f"Local prediction time: {time.time() - s} s")

        return get_node_ip_address(), predictions
Example #11
def select_stocks(context, data):
    #clf = pickle.load(BytesIO(read_file('xgb_factors_model_ZZ800_D.model')))
    #file = read_file('xgb_factors_model_ZZ800_D.model')
    #clf = Booster.load_model(fname = BytesIO(read_file('xgb_factors_model_ZZ800_D.model')))
    with open('temp', 'wb') as f:
        f.write(read_file('xgb_factors.model'))  # write a temporary file; clean it up when the process ends
    clf = Booster(model_file='temp')
    #clf = Booster.load_model(fname = 'temp')
    industry_old_code = ['801010','801020','801030','801040','801050','801080','801110','801120','801130','801140','801150',\
                    '801160','801170','801180','801200','801210','801230']
    industry_new_code = ['801010','801020','801030','801040','801050','801080','801110','801120','801130','801140','801150',\
                    '801160','801170','801180','801200','801210','801230','801710','801720','801730','801740','801750',\
                   '801760','801770','801780','801790','801880','801890']
    starttime = datetime.datetime.now()
    date = context.previous_date
    # fetch the industry factor data
    print('Data retrieval date:', date)
    '''
    if datetime.datetime.strptime(date,"%Y-%m-%d").date()<datetime.date(2014,2,21):
        industry_code=industry_old_code
    else:
    '''
    industry_code = industry_new_code
    stockList = get_stock('ZZ800', date)
    factor_origl_data = get_factor_data(stockList, date)
    factor_solve_data = data_preprocessing(factor_origl_data, stockList,
                                           industry_code, date)
    endtime = datetime.datetime.now()
    print('Data fetch runtime:', int((endtime - starttime).seconds / 60), 'minutes')
    test_feature_or = factor_solve_data.copy()
    test_feature = np.array(test_feature_or)
    # model prediction
    test_predict = clf.predict(DMatrix(test_feature_or))
    test_sample_predict = pd.DataFrame(data=test_predict,
                                       index=test_feature_or.index,
                                       columns=[
                                           'XGB_predict_0', 'XGB_predict_1',
                                           'XGB_predict_2', 'XGB_predict_3',
                                           'XGB_predict_4', 'XGB_predict_5',
                                           'XGB_predict_6', 'XGB_predict_7',
                                           'XGB_predict_8', 'XGB_predict_9',
                                           'XGB_predict_10', 'XGB_predict_11'
                                       ])
    #test_sample_predict['XGB_predict_0_and_1'] = test_sample_predict['XGB_predict_0'] + test_sample_predict['XGB_predict_1']
    test_sample_predict = test_sample_predict.sort_values(by='XGB_predict_0',
                                                          ascending=False)
    stock_list = test_sample_predict.index.values.tolist()
    stock_list = stock_list[:g.buy_stock_count]
    return stock_list
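The twelve `XGB_predict_*` columns imply a 12-class `multi:softprob` model, so each prediction row should be a probability vector; a quick hedged sanity check that could be added after the predict call:

assert test_predict.shape[1] == 12
assert np.allclose(test_predict.sum(axis=1), 1.0, atol=1e-5)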
Example #12
def predict(booster: xgb.Booster, X):
    '''A customized prediction function that converts raw prediction to
    target class.

    '''
    # `output_margin` requests the raw prediction taken from the tree leaf
    # weights, before the softmax transformation.
    predt = booster.predict(X, output_margin=True)
    out = np.zeros(kRows)
    for r in range(predt.shape[0]):
        # Pick the class with the maximum margin. The margins are not
        # strictly probabilities (they haven't gone through softmax yet,
        # so they don't sum to 1), but the argmax is the same.
        i = np.argmax(predt[r])
        out[r] = i
    return out
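Inside the function, the row loop could equivalently be collapsed into a single vectorized call over the same margin matrix `predt`:

out = np.argmax(predt, axis=1).astype(np.float64)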
Example #13
def _generate_cumulative_hazard(
    model: xgb.Booster,
    train_data: pd.DataFrame,
    dtrain: xgb.DMatrix,
) -> pd.DataFrame:
    """Generate the cumulative hazard.

    Parameters
    ----------
    model : xgb.Booster
        The trained model.
    train_data : pd.DataFrame
        The training dataset as a DataFrame, with the start/stop/event
        columns used below.
    dtrain : xgb.DMatrix
        The same training data as a DMatrix, used for prediction.

    Returns
    -------
    pd.DataFrame
        The cumulative baseline hazard, indexed by the unique event times.
    """
    # First, get the partial hazard values
    hazard = model.predict(dtrain)
    # Get the unique failure times
    unique_death_times = np.unique(
        train_data.loc[train_data[META["event"]] == 1, "stop"].values)
    baseline_hazard_ = pd.DataFrame(
        np.zeros_like(unique_death_times),
        index=unique_death_times,
        columns=["baseline hazard"],
    )

    for t in unique_death_times:
        ix = (train_data["start"].values < t) & (t <=
                                                 train_data["stop"].values)

        events_at_t = train_data[META["event"]].values[ix]
        stops_at_t = train_data["stop"].values[ix]
        hazards_at_t = hazard[ix]

        deaths = events_at_t & (stops_at_t == t)

        death_counts = deaths.sum()
        baseline_hazard_.loc[t] = death_counts / hazards_at_t.sum()

    return baseline_hazard_.cumsum()
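The loop is the Breslow estimator: at each unique event time, the baseline hazard increment is the death count divided by the summed partial hazards of the at-risk rows. A hypothetical glue line showing how the result could feed `_run_xgboost` from Example #2 (an assumption about the wiring, not confirmed by the snippets):

model.cumulative_hazard_ = _generate_cumulative_hazard(model, train_data, dtrain)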
Example #14
def test_model(model: Booster, Xt, yt, test_indices):
    d_test = DMatrix(Xt, label=yt)
    random_successful = 0.
    model_successful = 0.
    prob_all = model.predict(d_test, output_margin=True)

    start = 0
    for i in test_indices:
        end = i
        y = yt[start:end]
        if y.sum() != 1:
            logger.warning(f'Sum of y is not 1, {(y.sum(), start, end, y)}')
        random_successful += 1 / (end - start)

        prob = prob_all[start:end]
        if yt[start + prob.argmax()] > 0:
            model_successful += 1
        start = end

    return random_successful, model_successful
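`test_indices` holds the exclusive end offset of each ranking group in `Xt`/`yt`, so each slice `[start:end)` is one group with exactly one positive row. An illustrative call, assuming `model`, `Xt` and `yt` are already prepared:

# rows 0..2 form the first group, rows 3..6 the second (illustrative)
test_indices = [3, 7]
random_hits, model_hits = test_model(model, Xt, yt, test_indices)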
Example #15
    "like_pppprepost": [float(sys.argv[19])],
    "happiness": [float(sys.argv[20])],
    "love": [float(sys.argv[21])],
    "sadness": [float(sys.argv[22])],
    "travel": [float(sys.argv[23])],
    "food": [float(sys.argv[24])],
    "pet": [float(sys.argv[25])],
    "angry": [float(sys.argv[26])],
    "music": [float(sys.argv[27])],
    "party": [float(sys.argv[28])],
    "sport": [float(sys.argv[29])],
    "hashtags_pop_1": [0],
    "hashtags_pop_2": [0],
    "hashtags_pop_3": [0],
    "hashtags_pop_4": [0],
    "hashtags_pop_5": [0],
    "hashtags_pop_6": [0],
    "hashtags_pop_7": [0],
    "hashtags_pop_8": [0],
    "hashtags_pop_9": [0],
    "hashtags_pop_10": [0],
    "baseline": [float(sys.argv[30])]
}

df = pd.DataFrame.from_dict(input_item)
model = Booster()
model.load_model('Predictor/xgb_if_model_2020_v1.json')

output = model.predict(xgboost.DMatrix(df))
print(output)
Example #16
    def run_slice(self, booster: xgb.Booster, dtrain: xgb.DMatrix,
                  num_parallel_tree: int, num_classes: int,
                  num_boost_round: int):
        beg = 3
        end = 7
        sliced: xgb.Booster = booster[beg:end]
        assert sliced.feature_types == booster.feature_types

        sliced_trees = (end - beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced_trees = sliced_trees // 2
        sliced = booster[beg:end:2]
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[beg:...]
        sliced_trees = (num_boost_round -
                        beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[beg:]
        sliced_trees = (num_boost_round -
                        beg) * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        sliced = booster[...:end]
        sliced_trees = end * num_parallel_tree * num_classes
        assert sliced_trees == len(sliced.get_dump())

        with pytest.raises(ValueError, match=r">= 0"):
            booster[-1:0]

        # an empty slice is not accepted.
        with pytest.raises(ValueError):
            booster[1:1]
        # stop cannot be smaller than begin
        with pytest.raises(ValueError, match=r"Invalid.*"):
            booster[3:0]
        with pytest.raises(ValueError, match=r"Invalid.*"):
            booster[3:-1]
        # negative step is not supported.
        with pytest.raises(ValueError, match=r".*>= 1.*"):
            booster[0:2:-1]
        # step cannot be 0.
        with pytest.raises(ValueError, match=r".*>= 1.*"):
            booster[0:2:0]

        trees = [_ for _ in booster]
        assert len(trees) == num_boost_round

        with pytest.raises(TypeError):
            booster["wrong type"]
        with pytest.raises(IndexError):
            booster[:num_boost_round + 1]
        with pytest.raises(ValueError):
            booster[1, 2]  # too many dims
        # __setitem__ is not implemented, as the model is immutable during slicing.
        with pytest.raises(TypeError):
            booster[...:end] = booster

        sliced_0 = booster[1:3]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(1, 3)),
            sliced_0.predict(dtrain))
        sliced_1 = booster[3:7]
        np.testing.assert_allclose(
            booster.predict(dtrain, iteration_range=(3, 7)),
            sliced_1.predict(dtrain))

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5  # both margins include base_score (0.5); remove the extra copy
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)

        sliced_0 = booster[1:7:2]  # 1,3,5
        sliced_1 = booster[2:8:2]  # 2,4,6

        predt_0 = sliced_0.predict(dtrain, output_margin=True)
        predt_1 = sliced_1.predict(dtrain, output_margin=True)

        merged = predt_0 + predt_1 - 0.5  # again remove the doubled base_score
        single = booster[1:7].predict(dtrain, output_margin=True)
        np.testing.assert_allclose(merged, single, atol=1e-6)
Example #17
    with open(file_path.strip()) as f:
        doc = f.read().splitlines()
    with open(file_path.strip(), 'rb') as f:
        tokens = list(tokenize.tokenize(f.readline))

    token_i = 0
    doc_pos = (0, 0)
    df_i = 0
    length = len(tokens)
    token_selection_start = 0
    index_i = 0

    completions = list(df.c)
    d_test = DMatrix(fe.X)
    # Raw margins (not calibrated probabilities, despite the variable name).
    predicted_prob = model.predict(d_test, output_margin=True)

    correct_prediction = 0
    wrong_prediction = 0
    not_available = 0
    not_required = 0

    def get_range_from_doc(start, end):
        result = []
        ch = start[1]
        for line in range(max(start[0] - 1, 0), end[0]):
            end_ch = None if line != end[0] - 1 else end[1]
            result.append(doc[line][ch:end_ch])
            ch = 0
        return '\n'.join(result)