Example #1
import xgboost as xgb
from xgboost import DMatrix

def cv(X_t, y_t, X_test, y_test):
    '''
    Train an XGBoost model on DMatrix inputs (the optimized data structure
    provided by xgboost). Exercises the xgboost train/cv APIs as a starting
    point for hyperparameter tuning.
    '''
    dtrain = DMatrix(X_t, label=y_t)
    dtest = DMatrix(X_test, label=y_test)
    params={"objective":"binary:logistic",
            'max_depth': 9,
            'min_child_weight': 2,
            'subsample': 0.8,
            'eta': 0.1,
            'alpha': 0.2,
            'lambda': 0.2,
            'eval_metric':'auc'}
    num_boost_round = 313 #model.best_iteration + 1
    #fit
    model = xgb.train(params,
              dtrain,
              num_boost_round=num_boost_round,
              evals=[(dtest,'Test')])

    return model

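The commented-out model.best_iteration + 1 above suggests the round count was found
with early stopping. A minimal sketch (the helper name find_num_boost_round is
hypothetical) of deriving num_boost_round with xgb.cv:

import xgboost as xgb
from xgboost import DMatrix

def find_num_boost_round(X_t, y_t, params):
    # cross-validated early stopping; the returned frame keeps one row
    # per surviving boosting round
    dtrain = DMatrix(X_t, label=y_t)
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=1000,        # generous upper bound
        nfold=5,
        metrics='auc',
        early_stopping_rounds=25,
        seed=0,
    )
    return len(cv_results)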
Example #2
def main():
    p = get_cli_args(args)
    x_train, y_train, qid_train = load_svmlight_file(
        p.train.xgboost_train_path, query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    x_test, y_test, qid_test = load_svmlight_file(p.train.xgboost_test_path,
                                                  query_id=True)  # pylint: disable=unbalanced-tuple-unpacking
    # append two engineered ratio features built from existing columns
    x_train = x_train.todense()
    x_train = np.concatenate([
        x_train, x_train[:, -2] / x_train[:, 2], x_train[:, -1] / x_train[:, 4]
    ], 1)
    x_test = x_test.todense()
    x_test = np.concatenate(
        [x_test, x_test[:, -2] / x_test[:, 2], x_test[:, -1] / x_test[:, 4]],
        1)
    train_dmatrix = DMatrix(x_train, y_train)
    test_dmatrix = DMatrix(x_test, y_test)
    train_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_train)])
    test_dmatrix.set_group([len(list(g)) for __, g in groupby(qid_test)])
    params = {
        'objective': 'rank:pairwise',
        'eval_metric': ['error', 'map@1'],
        'tree_method': 'exact',
        'eta': 0.1,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 6
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=100,
                          evals=[(test_dmatrix, 'validation')])
    xgb_train_str = items_to_str(_.omit(params, 'objective',
                                        'eval_metric').items(),
                                 sort_by=itemgetter(0))
    xgb_model.save_model(xgb_train_str + '_model.xgb')
Example #3
def train_ranking():
    train_group_list, train_data_list, train_target_list = data_generation({})
    # demo shortcut: reuse the training data as both the test and eval sets
    test_group_list, test_data_list, test_target_list = train_group_list, train_data_list, train_target_list
    eval_group_list, eval_data_list, eval_target_list = train_group_list, train_data_list, train_target_list

    xgbTrain = DMatrix(np.asmatrix(train_data_list), label=train_target_list)
    xgbTrain.set_group(train_group_list)

    xgbEval = DMatrix(np.asmatrix(eval_data_list), label=eval_target_list)
    xgbEval.set_group(eval_group_list)
    evallist = [(xgbTrain, 'train'), (xgbEval, 'eval')]

    rankModel = train(xgb_rank_params2,
                      xgbTrain,
                      num_boost_round=50,
                      evals=evallist)
    rankModel.save_model('xgb.model')
    loaded_model = xgb.Booster(model_file='xgb.model')
    xgbTest = DMatrix(np.asmatrix(test_data_list), label=test_target_list)
    xgbTest.set_group(test_group_list)
    results = loaded_model.predict(xgbTest)

    with open('results.txt', mode='w', encoding='utf-8') as f:
        for item in results:
            f.write(str(item) + '\n')
Example #4
def to_dmatrix(data, labels=None):
    # use isinstance and `is not None`: comparing a pandas object with
    # `!= None` broadcasts elementwise and raises on truth-testing
    if isinstance(data, (pd.DataFrame, pd.Series)):
        if labels is not None and isinstance(labels, (pd.DataFrame, pd.Series)):
            return DMatrix(data.values, labels.values)
        return DMatrix(data.values)
    if labels is not None:
        return DMatrix(data, labels)
    return DMatrix(data)
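A short usage sketch for to_dmatrix (made-up data; assumes DMatrix is imported as
in the snippet's context):

import numpy as np
import pandas as pd

X_df = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
y_sr = pd.Series([0, 1])
dtrain = to_dmatrix(X_df, y_sr)              # pandas path: .values is used
dtest = to_dmatrix(np.array([[5.0, 6.0]]))   # numpy path, no labels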
Example #5
    def test_dmatrix_creator(self):

        # This function acts as a pseudo-itertools.chain()
        def row_tup_iter(data):
            pdf = pd.DataFrame(data)
            yield pdf

        # Standard testing DMatrix creation
        expected_features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        expected_labels = np.array([1, 0] * 100)
        expected_dmatrix = DMatrix(data=expected_features,
                                   label=expected_labels)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )
        # DMatrix objects can't be compared directly, so instead predict on the
        # two separate DMatrices with the same classifier and check that the
        # outputs match
        model = XGBClassifier()
        model.fit(expected_features, expected_labels)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))

        # DMatrix creation with weights
        expected_weight = np.array([0.2, 0.8] * 100)
        expected_dmatrix = DMatrix(data=expected_features,
                                   label=expected_labels,
                                   weight=expected_weight)

        data["weight"] = [0.2, 0.8] * 100
        output_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        model.fit(expected_features,
                  expected_labels,
                  sample_weight=expected_weight)
        expected_preds = model.get_booster().predict(expected_dmatrix)
        output_preds = model.get_booster().predict(output_dmatrix)
        self.assertTrue(np.allclose(expected_preds, output_preds, atol=1e-3))
Example #6
def main():
    #  Import training data
    x_train, y_train, qid_train = load_svmlight_file("hn.train", query_id=True)
    x_valid, y_valid, qid_valid = load_svmlight_file("hn.vali", query_id=True)
    x_test, y_test, qid_test = load_svmlight_file("hn.test", query_id=True)

    group_train = group_qid(qid_train)
    group_valid = group_qid(qid_valid)
    group_test = group_qid(qid_test)

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)
    test_dmatrix.set_group(group_test)

    # Train Xgboost with basic parameters
    params = {'objective': 'rank:pairwise', 'eta': 0.1,
              # 'gamma': 1.0,
              # 'min_child_weight': 0.1,
              'max_depth': 3}
    params['eval_metric'] = ['ndcg@1', 'ndcg@3', 'ndcg@5', 'ndcg@10']
    xgb_model = xgb.train(params, train_dmatrix, num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    pred = xgb_model.predict(test_dmatrix)

    data_predict = regroup_results(group_test, pred, y_test)

    # Test a random sample:
    # simple debug helper that prints the Algolia clicks and the predictions
    def print_random_sample(line):
        prevsum = sum(group_test[:line])
        print('Algolia clicks are: {}'.format(y_test[prevsum:prevsum + group_test[line]]))
        print('Predictions are: {}'.format(pred[prevsum:prevsum + group_test[line]]))
        print('Xgboost clicks are: {}'.format(data_predict[line]))
    print_random_sample(1)

    print('> Mean reciprocal rank is : {}'.format(
        mean_reciprocal_rank(data_predict)))
    print('> Mean average precision is : {}'.format(
        mean_average_precision(data_predict)))

    # nDCG
    for i in [1, 3, 5, 10]:
        ndcg_ = []
        for query in data_predict:
            ndcg_.append(ndcg_at_k(query, i))
        print('> nDCG@{} is : {}'.format(i, pd.Series(ndcg_).mean()))
Example #7
def train(model_file):
    #  This script demonstrates how to do ranking with xgboost.train
    x_train, y_train = load_svmlight_file("mq2008.train")
    x_valid, y_valid = load_svmlight_file("mq2008.vali")
    x_test, y_test = load_svmlight_file("mq2008.test")

    group_train = []
    with open("mq2008.train.group", "r", encoding="utf8") as f:
        data = f.readlines()
        for line in data:
            group_train.append(int(line.split("\n")[0]))

    group_valid = []
    with open("mq2008.vali.group", "r", encoding="utf8") as f:
        data = f.readlines()
        for line in data:
            group_valid.append(int(line.split("\n")[0]))

    group_test = []
    with open("mq2008.test.group", "r", encoding="utf8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))

    train_dmatrix = DMatrix(x_train, y_train)
    valid_dmatrix = DMatrix(x_valid, y_valid)
    test_dmatrix = DMatrix(x_test)

    train_dmatrix.set_group(group_train)
    valid_dmatrix.set_group(group_valid)

    params = {
        'objective': 'rank:pairwise',
        'eta': 0.01,
        'gamma': 1.0,
        'min_child_weight': 0.1,
        'max_depth': 8
    }
    xgb_model = xgb.train(params,
                          train_dmatrix,
                          num_boost_round=4,
                          evals=[(valid_dmatrix, 'validation')])
    pred = xgb_model.predict(test_dmatrix)
    xgb_model.dump_model(model_file + ".txt")
    xgb_model.save_model(model_file)
    # save figures
    plt.clf()
    xgb.plot_importance(xgb_model)
    plt.savefig('feature_importance.png', dpi=800, format='png')
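The three group-file loops above are identical; a compact helper (hypothetical name
load_group) could replace them:

def load_group(path):
    # each line of a .group file holds one integer: the size of one query group
    with open(path, "r", encoding="utf8") as f:
        return [int(line) for line in f if line.strip()]

group_train = load_group("mq2008.train.group")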
Example #8
File: data.py  Project: jayzed82/xgboost
def _convert_partition_data_to_dmatrix(
    partition_data_iter,
    has_weight,
    has_validation,
    has_base_margin,
    dmatrix_kwargs=None,
):
    # pylint: disable=too-many-locals, unbalanced-tuple-unpacking
    dmatrix_kwargs = dmatrix_kwargs or {}
    # if we are not using external storage, we use the standard method of parsing data.
    train_val_data = _prepare_train_val_data(partition_data_iter, has_weight,
                                             has_validation, has_base_margin)
    if has_validation:
        (
            train_x,
            train_y,
            train_w,
            train_b_m,
            val_x,
            val_y,
            val_w,
            val_b_m,
        ) = train_val_data
        training_dmatrix = DMatrix(
            data=train_x,
            label=train_y,
            weight=train_w,
            base_margin=train_b_m,
            **dmatrix_kwargs,
        )
        val_dmatrix = DMatrix(
            data=val_x,
            label=val_y,
            weight=val_w,
            base_margin=val_b_m,
            **dmatrix_kwargs,
        )
        return training_dmatrix, val_dmatrix

    train_x, train_y, train_w, train_b_m = train_val_data
    training_dmatrix = DMatrix(
        data=train_x,
        label=train_y,
        weight=train_w,
        base_margin=train_b_m,
        **dmatrix_kwargs,
    )
    return training_dmatrix
Example #9
def mknfold(X_train, y_train, nfold, param, evals=(), features=None):
    '''
    Makes n folds in input data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training feature data.
    y_train : pandas.DataFrame
        Training target data.
    nfold : int
        Number of folds in CV.
    param : dict
        Booster params.
    evals : list
        Evaluation metrics to be watched in CV.
    features : list
        Features selected for training.

    Returns
    -------
    ret : list
        List of CVPack objects, one per fold, each holding the training
        DMatrix, the test DMatrix, and the parameter/metric list.
    wt_list : list
        List of weights for each fold (the size of each fold).
    '''
    if not features:
        features = X_train.columns
    out_idset, wt_list = bin_fold(X_train, nfold)
    in_idset = [
        np.concatenate([out_idset[i] for i in range(nfold) if k != i])
        for k in range(nfold)
    ]
    evals = list(evals)
    ret = []
    for k in range(nfold):
        # perform the slicing using the indexes determined by the above methods
        x_train_snip = X_train.loc[in_idset[k]][features]
        y_train_snip = X_train.loc[in_idset[k]]['encoded_target']
        x_test_snip = X_train.loc[out_idset[k]][features]
        y_test_snip = X_train.loc[out_idset[k]]['encoded_target']
        dtrain = DMatrix(x_train_snip, label=y_train_snip)
        dtest = DMatrix(x_test_snip, label=y_test_snip)
        tparam = param
        plst = list(tparam.items()) + [('eval_metric', itm) for itm in evals]
        ret.append(CVPack(dtrain, dtest, plst))
    return ret, wt_list
Example #10
    def test_external_storage(self):
        # Instantiating base data (features, labels)
        features = np.array([[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100)
        labels = np.array([1, 0] * 100)
        normal_dmatrix = DMatrix(features, labels)
        test_dmatrix = DMatrix(features)

        data = {
            "values": [[1.0, 2.0, 3.0], [0.0, 1.0, 5.5]] * 100,
            "label": [1, 0] * 100,
        }

        # Creating the dmatrix based on storage
        temporary_path = tempfile.mkdtemp()
        storage_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=False,
            has_validation=False,
            has_base_margin=False,
        )

        # Testing without weights
        normal_booster = worker_train({}, normal_dmatrix)
        storage_booster = worker_train({}, storage_dmatrix)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
        shutil.rmtree(temporary_path)

        # Testing weights
        weights = np.array([0.2, 0.8] * 100)
        normal_dmatrix = DMatrix(data=features, label=labels, weight=weights)
        data["weight"] = [0.2, 0.8] * 100

        temporary_path = tempfile.mkdtemp()
        storage_dmatrix = _convert_partition_data_to_dmatrix(
            [pd.DataFrame(data)],
            has_weight=True,
            has_validation=False,
            has_base_margin=False,
        )

        normal_booster = worker_train({}, normal_dmatrix)
        storage_booster = worker_train({}, storage_dmatrix)
        normal_preds = normal_booster.predict(test_dmatrix)
        storage_preds = storage_booster.predict(test_dmatrix)
        self.assertTrue(np.allclose(normal_preds, storage_preds, atol=1e-3))
        shutil.rmtree(temporary_path)
Example #11
    def __classify(self, path):
        files = [self.parse_pe(path)]
        df = pd.DataFrame(files)
        df = df.drop(['sha256', 'size'], axis=1)
        # explode the per-file section list into one row per section
        sections = df['sections'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)

        # explode imports into one row per (dll, symbol) pair
        imports = df['import'].apply(pd.Series).stack().reset_index(level=1, drop=True).apply(pd.Series)
        imports = imports.reset_index().set_index(['index', 'dll'])
        imports = imports['symbols'].apply(pd.Series).stack().reset_index(level=2, drop=True).to_frame('import').reset_index().set_index('index')

        join = sections.join(imports).fillna(0)

        join['SectionName'] = join['SectionName'].astype('str')
        join['dll'] = join['dll'].astype('str')
        join['import'] = join['import'].astype('str')

        string_columns = ['SectionName', 'dll', 'import']
        matrix = self.ohe.transform(join[string_columns])

        index = join.index
        rows = []
        for i in index.unique():
            select = index.slice_indexer(start=i, end=i)
            rows.append(csr_matrix(matrix[select].sum(axis=0)))

        join_encoded = pd.DataFrame(data={'matrix':rows})

        df = df.drop(['sections', 'import'], axis=1)
        df = df.join(join_encoded)

        X = df.apply(lambda x: hstack((x.drop('matrix').astype('int64').values, x['matrix'])).T, axis=1)
        X = hstack(X.values).T
        X = X.todok().toarray()
        return self.booster.predict(DMatrix(X))[0]
Example #12
    def trainModel(self, train_x, train_y):
        # train an xgboost model
        self.xgb_clf = xgb.XGBClassifier(nthread=self.xgb_nthread)
        self.xgb_clf.fit(train_x,
                         train_y,
                         eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])

        xgb_eval_result = self.xgb_clf.evals_result()
        print('XGB_train eval_result:', xgb_eval_result)

        train_x_mat = DMatrix(train_x)

        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)

        self.one_hot_encoder = OneHotEncoder()
        train_lr_feature_mat = self.one_hot_encoder.fit_transform(
            train_xgb_pred_mat)
        print('train_mat:', train_lr_feature_mat.shape)
        # train a LR model
        self.lr_clf = LR()
        self.lr_clf.fit(train_lr_feature_mat, train_y)

        self.init_flag = True

        with open(self.xgb_model_name, 'wb') as f:
            pickle.dump(self.xgb_clf, f, True)
        with open(self.lr_model_name, 'wb') as f:
            pickle.dump(self.lr_clf, f, True)
        with open(self.one_hot_encoder_model_name, 'wb') as f:
            pickle.dump(self.one_hot_encoder, f, True)

        print('Train xgboost and lr model done')
Example #13
File: dmatrix.py  Project: ueshin/mars
    def get_xgb_dmatrix(tup):
        from xgboost import DMatrix

        data, label, weight, missing, feature_names, feature_types = tup
        return DMatrix(data, label=label, missing=missing, weight=weight,
                       feature_names=feature_names, feature_types=feature_types,
                       nthread=-1)
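A usage sketch (with made-up values) of the 6-tuple this helper expects:

import numpy as np

tup = (
    np.random.rand(10, 3),           # data
    np.random.randint(2, size=10),   # label
    None,                            # weight
    np.nan,                          # missing
    ['f0', 'f1', 'f2'],              # feature_names
    None,                            # feature_types
)
dmat = get_xgb_dmatrix(tup)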
Example #14
    def predict(self, smiles, get_features=get_fp, use_tqdm=False):
        canonical_smiles = []
        invalid_smiles = []
        if use_tqdm:
            pbar = tqdm(range(len(smiles)))
        else:
            pbar = range(len(smiles))
        for i in pbar:
            sm = smiles[i]
            if use_tqdm:
                pbar.set_description("Calculating predictions...")
            try:
                sm = Chem.MolToSmiles(Chem.MolFromSmiles(sm, sanitize=False))
                if len(sm) == 0:
                    invalid_smiles.append(sm)
                else:
                    canonical_smiles.append(sm)
            except Exception:
                invalid_smiles.append(sm)
        if len(canonical_smiles) == 0:
            return canonical_smiles, [], invalid_smiles
        prediction = []
        x, _, _ = get_features(canonical_smiles, sanitize=False)
        x = DMatrix(x)
        for i in range(len(self.models)):
            y_pred = self.models[i].predict(x)
            if self.transformer is not None:
                y_pred = self.transformer.inverse_transform(y_pred)
            prediction.append(y_pred)
        prediction = np.array(prediction)
        prediction = np.mean(prediction, axis=0)
        return canonical_smiles, prediction, invalid_smiles
Example #15
    def Predict(self, request: predict_pb2.PredictRequest,
                context: grpc.RpcContext):
        model_name = request.model_spec.name
        if model_name not in self.model_map:
            raise PythieServingException(
                f'Unknown model: {model_name}. This pythie-serving instance can only '
                f'serve one of the following: {",".join(self.model_map.keys())}'
            )

        model_dict = self.model_map[model_name]

        features_names = model_dict['feature_names']
        feature_rows = []
        for feature_name in features_names:
            if feature_name not in request.inputs:
                raise PythieServingException(
                    f'{feature_name} not set in the predict request')
            nd_array = make_ndarray_from_tensor(request.inputs[feature_name])
            if len(nd_array.shape) != 2 or nd_array.shape[1] != 1:
                raise PythieServingException(
                    'All input vectors should be 1D tensor')
            feature_rows.append(nd_array)

        if len(set(len(l) for l in feature_rows)) != 1:
            raise PythieServingException(
                'All input vectors should have the same length')

        model = model_dict['model']
        d_matrix = DMatrix(np.concatenate(feature_rows, axis=1),
                           feature_names=features_names)
        outputs = model.predict(d_matrix, ntree_limit=model.best_ntree_limit)
        outputs = outputs.reshape((outputs.size, 1))  # return 1D tensor
        return outputs
Example #16
File: core.py  Project: ijpulidos/kinoml
    def to_xgboost(self, **kwargs):
        from xgboost import DMatrix

        dmatrix = DMatrix(self.to_numpy(**kwargs))
        ## TODO: Uncomment when XGB observation models are implemented
        # dmatrix.observation_model = self.observation_model(backend="xgboost", loss="mse")
        return dmatrix
Example #17
    def update(self, Xtrain, ytrain, Xval, yval, scoring, n_iterations):
        dtrain = DMatrix(data=Xtrain, label=ytrain)

        early_stop_callback = early_stop()

        if not (self.env['earlier_stop']):
            for i in range(n_iterations - self.model.n_estimators):
                # note: get_booster() returns a reference, so updating it also
                # updates the booster inside the XGBClassifier; add a unit test
                # to guard this behaviour against future xgboost updates
                self.model.get_booster().update(
                    dtrain, iteration=self.model.n_estimators)
                self.model.n_estimators += 1

                score = scoring(self, Xval, yval)

                if score > self.env['best_score']:
                    self.env['best_score'] = score
                    self.env['best_iteration'] = self.model.n_estimators
                try:
                    early_stop_callback(env=self.env,
                                        score=score,
                                        iteration=self.model.n_estimators)
                except EarlyStopException:
                    print('Update Stopped Earlier! @ {} instead of {}'.format(
                        self.model.n_estimators, n_iterations))
                    self.env['earlier_stop'] = True
                    break
Example #18
    def test_xgboost_booster_classifier_reg(self):
        x, y = make_classification(n_classes=2,
                                   n_features=5,
                                   n_samples=100,
                                   random_state=42,
                                   n_informative=3)
        y = y.astype(np.float32) + 0.567
        x_train, x_test, y_train, _ = train_test_split(x,
                                                       y,
                                                       test_size=0.5,
                                                       random_state=42)

        data = DMatrix(x_train, label=y_train)
        model = train(
            {
                'objective': 'reg:squarederror',
                'n_estimators': 3,
                'min_child_samples': 1
            }, data)
        model_onnx = convert_xgboost(
            model, 'tree-based classifier',
            [('input', FloatTensorType([None, x.shape[1]]))])
        dump_data_and_model(
            x_test.astype(np.float32),
            model,
            model_onnx,
            allow_failure=
            "StrictVersion(onnx.__version__) < StrictVersion('1.3.0')",
            basename="XGBBoosterReg")
Example #19
    def train_model(self, train_x, train_y):
        """
        train an xgboost model
        :param train_x:
        :param train_y:
        :return:
        """
        self.xgb_clf = xgb.XGBClassifier()
        self.xgb_clf.fit(train_x, train_y, eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])
        xgb_eval_result = self.xgb_clf.evals_result()
        print('Xgb train eval result:', xgb_eval_result)

        train_x_mat = DMatrix(train_x)
        # get boost tree leaf info
        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)
        # begin one-hot encoding
        self.one_hot_encoder = OneHotEncoder()
        train_lr_feature_mat = self.one_hot_encoder.fit_transform(train_xgb_pred_mat)
        print('train_mat:', train_lr_feature_mat.shape)

        # lr
        self.lr_clf = LogisticRegression()
        self.lr_clf.fit(train_lr_feature_mat, train_y)
        self.init = True

        # dump xgboost+lr model
        with open(self.xgb_model_name, 'wb') as f1, open(self.lr_model_name, 'wb') as f2, \
            open(self.one_hot_model_name, 'wb') as f3:
            pickle.dump(self.xgb_clf, f1, True)
            pickle.dump(self.lr_clf, f2, True)
            pickle.dump(self.one_hot_encoder, f3, True)
Example #20
    def _prepare_data(self,
                      back_training_feat,
                      thigh_training_feat,
                      back_temp,
                      thigh_temp,
                      labels,
                      samples_pr_window,
                      sampling_freq,
                      train_overlap):
        back_training_feat = temp_feature_util.segment_acceleration_and_calculate_features(back_training_feat,
                                                                                           temp=back_temp,
                                                                                           samples_pr_window=samples_pr_window,
                                                                                           sampling_frequency=sampling_freq,
                                                                                           overlap=train_overlap)

        thigh_training_feat = temp_feature_util.segment_acceleration_and_calculate_features(thigh_training_feat,
                                                                                            temp=thigh_temp,
                                                                                            samples_pr_window=samples_pr_window,
                                                                                            sampling_frequency=sampling_freq,
                                                                                            overlap=train_overlap)

        labels = temp_feature_util.segment_labels(labels, samples_pr_window=samples_pr_window, overlap=train_overlap)

        labels = self._one_hot_encode(labels)


        both_features = np.hstack((back_training_feat, thigh_training_feat))

        # We need to convert the dataframe into a DMatrix
        dmatrix = DMatrix(both_features, label=labels)

        return dmatrix, labels
Example #21
    def fit(self, train_x, train_y):
        """
        train an xgboost_lr model
        :param train_x:
        :param train_y:
        :return:
        """
        from xgboost import DMatrix
        self.xgb_clf.fit(train_x,
                         train_y,
                         eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])
        xgb_eval_result = self.xgb_clf.evals_result()
        print('Xgb train eval result:', xgb_eval_result)

        train_x_mat = DMatrix(train_x)
        # get boost tree leaf info
        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)
        print(train_xgb_pred_mat)

        # begin one-hot encoding
        train_lr_feature_mat = self.one_hot_encoder.fit_transform(
            train_xgb_pred_mat)
        print('train_mat:', train_lr_feature_mat.shape)
        print('train_mat array:', train_lr_feature_mat.toarray())

        # lr
        self.lr_clf.fit(train_lr_feature_mat, train_y)
        self.init = True

        model = [self.xgb_clf, self.lr_clf, self.one_hot_encoder]
        # dump xgboost+lr model
        with open(self.model_save_path, 'wb') as f:
            pickle.dump(model, f, True)
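Examples #12, #19, #21, #23 and #24 all follow the same GBDT+LR pattern: take the
leaf index each boosted tree assigns to a sample as a categorical feature, one-hot
encode those indices, and fit a logistic regression on top. A self-contained sketch
of the idea on synthetic data (all names here are illustrative):

import xgboost as xgb
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

X, y = make_classification(n_samples=500, n_features=10, random_state=0)

gbdt = xgb.XGBClassifier(n_estimators=30, max_depth=3)
gbdt.fit(X, y)

# leaf index per tree -> one categorical feature per tree
leaves = gbdt.get_booster().predict(xgb.DMatrix(X), pred_leaf=True)
encoder = OneHotEncoder()
lr_features = encoder.fit_transform(leaves)   # sparse one-hot matrix

lr = LogisticRegression(max_iter=1000)
lr.fit(lr_features, y)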
Example #22
    def train(self, x, y, model=None):
        self.bst = xgb.train(
            params=vars(self.hparams.bst),
            dtrain=DMatrix(x, label=y),
            num_boost_round=self.hparams.num_rounds,
            xgb_model=model,
        )
Example #23
File: xblr.py  Project: yuepaang/PY-NLP
    def train(self, train_x, train_y):
        """Train a xgboost_lr model
        
        Arguments:
            train_x {[type]} -- [description]
            train_y {[type]} -- [description]
        """
        self.xgb_clf.fit(train_x,
                         train_y,
                         eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])
        xgb_eval_result = self.xgb_clf.evals_result()
        print("train eval result: ", xgb_eval_result)

        train_x_mat = DMatrix(train_x)
        # get boost tree leaf info
        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)
        print(train_xgb_pred_mat)

        train_lr_feature_mat = self.one_hot_encoder.fit_transform(
            train_xgb_pred_mat)
        print('train_mat:', train_lr_feature_mat.shape)
        print('train_mat array:', train_lr_feature_mat.toarray())

        # lr
        self.lr_clf.fit(train_lr_feature_mat, train_y)
        self.init = True
Example #24
    def trainModel(self, train_x, train_y):
        # train an xgboost model
        sys.stdout.flush()
        self.xgb_clf = xgb.XGBClassifier(nthread=self.xgb_nthread)
        self.xgb_clf.fit(train_x, train_y, eval_metric=self.xgb_eval_metric,
                         eval_set=[(train_x, train_y)])

        xgb_eval_result = self.xgb_clf.evals_result()
        print('XGB_train eval_result:', xgb_eval_result)
        sys.stdout.flush()

        train_x_mat = DMatrix(train_x)
        print('get boost tree leaf info...')
        train_xgb_pred_mat = self.xgb_clf.get_booster().predict(train_x_mat,
                                                                pred_leaf=True)
        print('get boost tree leaf info done\n')

        print('begin one-hot encoding...')
        self.one_hot_encoder = OneHotEncoder()
        train_lr_feature_mat = self.one_hot_encoder.fit_transform(train_xgb_pred_mat)
        print('one-hot encoding done!\n\n')
        print('train_mat:', train_lr_feature_mat.shape)
        sys.stdout.flush()
        # train a LR model
        self.lr_clf = LR()
        self.lr_clf.fit(train_lr_feature_mat, train_y)

        self.init_flag = True

        print('dump xgboost+lr model..')
        with open(self.xgb_model_name, 'wb') as f:
            pickle.dump(self.xgb_clf, f, True)
        with open(self.lr_model_name, 'wb') as f:
            pickle.dump(self.lr_clf, f, True)
        with open(self.one_hot_encoder_model_name, 'wb') as f:
            pickle.dump(self.one_hot_encoder, f, True)

        print('Train xgboost and lr model done')
Example #25
    def apply(self, X, ntree_limit=0):
        """Return the predicted leaf every tree for each sample.

        Parameters
        ----------
        X : array_like, shape=[n_samples, n_features]
            Input features matrix.

        ntree_limit : int
            Limit number of trees in the prediction; defaults to 0 (use all trees).

        Returns
        -------
        X_leaves : array_like, shape=[n_samples, n_trees]
            For each datapoint x in X and for each tree, return the index of the
            leaf x ends up in. Leaves are numbered within
            ``[0; 2**(self.max_depth+1))``, possibly with gaps in the numbering.
        """
        sizes, group_indices, X_features, _, _ = _preprare_data_in_groups(X)
        test_dmatrix = DMatrix(X_features, missing=self.missing)
        test_dmatrix.set_group(sizes)
        X_leaves = self.get_booster().predict(test_dmatrix,
                                              pred_leaf=True,
                                              ntree_limit=ntree_limit)
        revert_group_indices = np.arange(
            len(group_indices))[group_indices.argsort()]
        X_leaves = X_leaves[revert_group_indices, :]
        return X_leaves
Example #26
def predict_xgboost_answers(xgb_model):
    # write the trained model's predictions on the test set as ((code, article), probability) pairs
    load_tfidf_1 = TFIDF.load(os.path.join(PATH_TO_TF_IDF, 'tf_idf_1'))
    x_test, y_test = sklearn.datasets.load_svmlight_file(
        os.path.join(PATH_TO_LEARNING_TO_RANK, 'x_test.txt'))
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))
    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)
    pred = xgb_model.predict(test_dmatrix)
    prediction_answer = []
    for i, p in enumerate(pred):
        prediction_answer.append(
            (load_tfidf_1.num_to_num_dict[i % CNT_ARTICLES], p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    if os.path.exists(predict_file):
        os.remove(predict_file)
    predictions = [str(p) for p in prediction_answer]
    with open(predict_file, 'w', encoding="utf-8") as f:
        f.write('\n'.join(predictions))
Example #27
    def predict(self, x, **kwargs):
        """
        Perform prediction for a batch of inputs.

        :param x: Test set.
        :type x: `np.ndarray`
        :return: Array of predictions of shape `(nb_inputs, nb_classes)`.
        :rtype: `np.ndarray`
        """
        from xgboost import Booster, XGBClassifier
        from art.utils import to_categorical

        # Apply preprocessing
        x_preprocessed, _ = self._apply_preprocessing(x, y=None, fit=False)

        if isinstance(self._model, Booster):
            from xgboost import DMatrix

            train_data = DMatrix(x_preprocessed, label=None)
            predictions = self._model.predict(train_data)
            y_prediction = np.asarray([line for line in predictions])
            if len(y_prediction.shape) == 1:
                y_prediction = to_categorical(labels=y_prediction,
                                              nb_classes=self.nb_classes())
        elif isinstance(self._model, XGBClassifier):
            y_prediction = self._model.predict_proba(x_preprocessed)

        # Apply postprocessing
        y_prediction = self._apply_postprocessing(preds=y_prediction,
                                                  fit=False)

        return y_prediction
Example #28
def predict_xgboost_answers(xgb_model):
    # write the trained model's predictions on the test set as ((code, article), probability) pairs
    features = pd.read_csv(f"{PATH_TO_LEARNING_TO_RANK}/x_test.csv", sep=',')
    x_test = features.drop(['doc_id', 'is_rel', '7'], axis=1)
    group_test = []
    with open(os.path.join(PATH_TO_LEARNING_TO_RANK, "gr_test.txt"),
              "r",
              encoding="utf-8") as f:
        data = f.readlines()
        for line in data:
            group_test.append(int(line.split("\n")[0]))

    test_dmatrix = DMatrix(x_test)
    test_dmatrix.set_group(group_test)

    pred = xgb_model.predict(test_dmatrix)
    corpus = SimpleCorp.load("codexes_corp_articles",
                             os.path.join(PATH_TO_FILES, "corp"))
    prediction_answer = []
    for p, doc_id in zip(
            pred,
            list(corpus.corpus.keys()) * (len(pred) // CNT_ARTICLES)):
        prediction_answer.append((doc_id, p))
    predict_file = os.path.join(PATH_TO_LEARNING_TO_RANK,
                                'prediction_file.txt')
    if os.path.exists(predict_file):
        os.remove(predict_file)
    predictions = [str(p) for p in prediction_answer]
    with open(predict_file, 'w', encoding="utf-8") as f:
        f.write('\n'.join(predictions))
Example #29
File: xgboost.py  Project: gth158a/eli5
def _prediction_feature_weights(xgb, X, feature_names, xgb_feature_names):
    """ For each target, return score and numpy array with feature weights
    on this prediction, following an idea from
    http://blog.datadive.net/interpreting-random-forests/
    """
    # XGBClassifier does not have pred_leaf argument, so use booster
    booster = xgb.booster()  # type: Booster
    leaf_ids, = booster.predict(DMatrix(X, missing=xgb.missing),
                                pred_leaf=True)
    xgb_feature_names = {f: i for i, f in enumerate(xgb_feature_names)}
    tree_dumps = booster.get_dump(with_stats=True)
    assert len(tree_dumps) == len(leaf_ids)

    target_feature_weights = partial(_target_feature_weights,
                                     feature_names=feature_names,
                                     xgb_feature_names=xgb_feature_names)
    n_targets = _xgb_n_targets(xgb)
    if n_targets > 1:
        # For multiclass, XGBoost stores dumps and leaf_ids in a 1d array,
        # so we need to split them.
        scores_weights = [
            target_feature_weights(
                leaf_ids[target_idx::n_targets],
                tree_dumps[target_idx::n_targets],
            ) for target_idx in range(n_targets)
        ]
    else:
        scores_weights = [target_feature_weights(leaf_ids, tree_dumps)]
    return scores_weights
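A small self-contained sketch (made-up data) of the pred_leaf output that the
[target_idx::n_targets] slicing above relies on: for a k-class model the trees are
class-interleaved, so the tree from round t for class c sits at column t * k + c.

import numpy as np
import xgboost as xgb
from xgboost import DMatrix

X = np.random.rand(5, 4)
y = np.array([0, 1, 2, 1, 0])                 # 3 classes
booster = xgb.train({'objective': 'multi:softprob', 'num_class': 3},
                    DMatrix(X, label=y), num_boost_round=2)

leaf_ids = booster.predict(DMatrix(X), pred_leaf=True)
print(leaf_ids.shape)                         # (5, 6): 2 rounds x 3 classes
class_0_trees = leaf_ids[:, 0::3]             # the slicing used above, per class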
Example #30
    def score(self, pred_contribs=False):
        model = self.model.fit_model
        scoring_data = self.data.modeling_data

        missing_cols = setdiff(self.model.train_columns, list(scoring_data.columns))
        extra_cols = setdiff(list(scoring_data.columns), self.model.train_columns)

        # print('Missing cols: ' + ', '.join(missing_cols))
        # print('Extra cols: ' + ', '.join(extra_cols))

        for col in missing_cols:
            if '__' in col:
                scoring_data[col] = 0
            else:
                scoring_data[col] = nan

        # extra_cols all come from scoring_data.columns, so this drop cannot fail
        if extra_cols:
            scoring_data = scoring_data.drop(extra_cols, axis=1)
            print('Dropping ' + ', '.join(extra_cols))

        scoring_data = scoring_data[self.model.train_columns]
        xgb_data = DMatrix(scoring_data, label=self.data.target)

        if pred_contribs:
            contribs = model.predict(xgb_data, pred_contribs=True)
            self.contribs = pd.DataFrame.from_records(
                contribs, columns=list(scoring_data.columns) + ['bias'])

        self.preds = model.predict(xgb_data)
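As a sanity check on pred_contribs (a sketch reusing the model and xgb_data names
from score above, valid for a single-output model): the per-feature contributions
plus the bias column sum to the raw margin prediction.

import numpy as np

contribs = model.predict(xgb_data, pred_contribs=True)
margins = model.predict(xgb_data, output_margin=True)
assert np.allclose(contribs.sum(axis=1), margins, atol=1e-5)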