Example #1
class ModelNgbClassifier(Model):
    def train(self, tr_x, tr_y, va_x=None, va_y=None, te_x=None):

        # Set the hyperparameters
        params = dict(self.params)
        early_stopping_rounds = params.pop('early_stopping_rounds')

        self.model = NGBClassifier(**params)
        self.model.fit(tr_x.values,
                       tr_y.astype(int).values,
                       X_val=va_x.values if va_x is not None else None,
                       Y_val=va_y.astype(int).values if va_y is not None else None,
                       early_stopping_rounds=early_stopping_rounds)

    def predict(self, te_x):
        return self.model.predict_proba(te_x.values)[:, 1]

    def save_model(self):
        model_path = os.path.join('../output/model',
                                  f'{self.run_fold_name}.model')
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        Data.dump(self.model, model_path)

    def load_model(self):
        model_path = os.path.join('../output/model',
                                  f'{self.run_fold_name}.model')
        self.model = Data.load(model_path)
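A minimal driver for the wrapper above; the Model base class and its constructor signature (run_fold_name, params) are assumptions, so treat this as an illustrative sketch only.

# Hypothetical usage sketch; the constructor signature is an assumption
# about the Model base class that supplies self.params and self.run_fold_name.
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
tr_x, va_x, tr_y, va_y = train_test_split(pd.DataFrame(X), pd.Series(y), test_size=0.2)

params = {'verbose': False, 'early_stopping_rounds': 50}
model = ModelNgbClassifier('ngb-fold0', params)  # hypothetical constructor
model.train(tr_x, tr_y, va_x, va_y)
model.save_model()
print(model.predict(va_x)[:5])  # P(y=1) for the first few validation rows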
Example #2
def test_classification():
    from sklearn.datasets import load_breast_cancer
    from sklearn.metrics import roc_auc_score, log_loss
    from sklearn.model_selection import train_test_split
    data, target = load_breast_cancer(return_X_y=True)
    x_train, x_test, y_train, y_test = train_test_split(data,
                                                        target,
                                                        test_size=0.2,
                                                        random_state=42)
    ngb = NGBClassifier(Dist=Bernoulli, verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.predict(x_test)
    score = roc_auc_score(y_test, preds)
    assert score >= 0.95

    preds = ngb.predict_proba(x_test)
    score = log_loss(y_test, preds)
    assert score <= 0.20

    score = ngb.score(x_test, y_test)
    assert score <= 0.20

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Bernoulli)

    preds = ngb.dist_to_prediction(dist)
    score = roc_auc_score(y_test, preds)
    assert score >= 0.95
Example #3
class NGBoost(BaseEstimator, ClassifierMixin):
    def __init__(self, **params):
        logger.info('Initializing NGBoost...')
        self.params_ = params
        self.classes_ = np.array([0, 1])

    def get_params(self, deep=True):
        return dict(self.params_)

    def _to_numpy(self, X):
        if isinstance(X, (pd.DataFrame, pd.Series)):
            try:
                return X.to_numpy()
            except Exception as e:
                raise ValueError('Error when converting to numpy') from e
        elif isinstance(X, np.ndarray):
            return X
        else:
            raise ValueError('X must be a pandas DataFrame, Series or numpy ndarray')

    def fit(self, X, y, *args, **kwargs):
        logger.info('NGBoost, fit')
        logger.info(f'NGBoost, training data shape {X.shape}')
        logger.info(f'NGBoost, training label shape {y.shape}')

        X_np = self._to_numpy(X)
        y_np = self._to_numpy(y)
        y_np = y_np.astype(int)
        logger.info(f'np.unique(y_np): {np.unique(y_np)}')
        self.estimator_ = NGBClassifier(**self.params_)
        self.estimator_.fit(X_np, y_np)
        logger.info('NGBoost, done fit')
        return self

    def transform(self, X, *args, **kwargs):
        logger.info('NGBoost, transform')
        logger.info(f'NGBoost, transform, testing shape: {X.shape}')
        X_np = self._to_numpy(X)
        pred = self.estimator_.predict_proba(X_np)[:, 1].reshape(-1)
        logger.info(f'NGBoost, transform, predictions shape: {pred.shape}')
        logger.info('NGBoost, done transform')
        return pred

    def score(self, X, y, *args, **kwargs):
        return roc_auc_score(y, self.transform(X))

    def predict_proba(self, X, *args, **kwargs):
        logger.info('NGBoost, predict_proba')
        logger.info(f'NGBoost, predict_proba, testing shape: {X.shape}')
        X_np = self._to_numpy(X)
        pred = self.estimator_.predict_proba(X_np)
        logger.info('NGBoost, predict_proba, done')
        return pred
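Because the wrapper implements fit and transform, it can slot into an sklearn Pipeline; a minimal sketch, assuming the NGBoost class above is in scope (the step names and hyperparameters are hypothetical):

# Minimal usage sketch; step names and hyperparameters are hypothetical.
from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_breast_cancer(return_X_y=True)
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('ngb', NGBoost(n_estimators=100, verbose=False)),  # kwargs forwarded to NGBClassifier
])
pipe.fit(X, y)
p1 = pipe.transform(X)  # P(y=1) per row, shape (n_samples,)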
Example #4
def test_classification(breast_cancer_data):
    from sklearn.metrics import roc_auc_score, log_loss

    x_train, x_test, y_train, y_test = breast_cancer_data
    ngb = NGBClassifier(Dist=Bernoulli, verbose=False)
    ngb.fit(x_train, y_train)
    preds = ngb.predict(x_test)
    score = roc_auc_score(y_test, preds)

    # loose score requirement so it isn't failing all the time
    assert score >= 0.85

    preds = ngb.predict_proba(x_test)
    score = log_loss(y_test, preds)
    assert score <= 0.30

    score = ngb.score(x_test, y_test)
    assert score <= 0.30

    dist = ngb.pred_dist(x_test)
    assert isinstance(dist, Bernoulli)

    score = roc_auc_score(y_test, preds[:, 1])

    assert score >= 0.85
Example #5
    def NGBoost(self, args):  ## Natural Gradient Boosting

        logger.info("Running Natural Gradient Boosting ... ")
        ## https://stanfordmlgroup.github.io/ngboost/1-useage.html
        from ngboost.learners import default_tree_learner
        from ngboost.distns import k_categorical, Bernoulli  ## Classifier
        from ngboost.distns import Exponential, Normal, LogNormal  ## Regressor
        from ngboost.scores import MLE, LogScore, CRPScore

        ## Base learner
        from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier

        # NGBoost
        ## Note: if a singular-matrix error occurs, increase the size of the input data from 0.05 to 0.15
        if args.predictor.lower() == 'classifier':
            from ngboost import NGBClassifier
            learner = DecisionTreeRegressor(criterion='friedman_mse',
                                            max_depth=6,
                                            random_state=SEED)
            ngb = NGBClassifier(Base=learner,
                                n_estimators=2000,
                                Score=MLE,
                                Dist=Bernoulli,
                                random_state=SEED)

        elif args.predictor.lower() == 'regressor':
            from ngboost import NGBRegressor
            # Note: this branch keeps the library's default_tree_learner as the
            # base model; the DecisionTreeRegressor below is defined but unused.
            learner = DecisionTreeRegressor(criterion='friedman_mse',
                                            max_depth=3,
                                            random_state=SEED)
            ngb = NGBRegressor(Base=default_tree_learner,
                               Dist=Exponential,
                               Score=LogScore,
                               learning_rate=0.01,
                               minibatch_frac=0.6,
                               col_sample=0.6)

        ## Fit the model
        ngb.fit(self.X_train, np.asarray(self.y_train).astype(int))

        ## Predict the labels
        self.y_pred = ngb.predict(self.X_data)

        if args.predictor.lower() == 'regressor':
            self.y_pred = logistic.cdf(self.y_pred)

        self.data['boosting_score'] = self.y_pred
        self.model = ngb
        return self
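The method above relies on a few module-level names the snippet doesn't show; a minimal sketch of the assumed context (the SEED value is hypothetical):

# Assumed module-level context for the method above; the SEED value is hypothetical.
import logging
import numpy as np
from scipy.stats import logistic  # provides logistic.cdf used on the regressor path

logger = logging.getLogger(__name__)
SEED = 42  # any fixed seed, for reproducibility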
Example #6
    def start(self):
        """ 01. Initialise the data paths and transformation functions.  """
        self.data_dir = '../data/raw_data'
        self.trans_primitives = ['weekday', 'hour', 'time_since_previous']
        self.agg_primitives = [
            'mean', 'max', 'min', 'std', 'count', 'percent_true', 'last',
            'time_since_last', 'mode'
        ]
        self.ignore_cols = [
            'num_contacts', 'num_referrals', 'num_successful_referrals'
        ]
        self.feature_windows = [10, 30, 60, 90]  #[10,20,30]
        self.max_feature_depth = 2

        # list of estimators to use
        self.estimators = [
            ('cbc', CatBoostClassifier()), ('lgbmc', LGBMClassifier()),
            ('gbc',
             GradientBoostingClassifier(validation_fraction=0.15,
                                        n_iter_no_change=50)),
            ('et', ExtraTreeClassifier()), ('abc', AdaBoostClassifier()),
            ('rfc', RandomForestClassifier()), ('bc', BaggingClassifier()),
            ('etc', ExtraTreesClassifier()), ('gnb', GaussianNB()),
            ('mlpc', MLPClassifier()), ('gpc', GaussianProcessClassifier()),
            ('dtc', DecisionTreeClassifier()),
            ('qda', QuadraticDiscriminantAnalysis()),
            ('lr', LogisticRegression()), ('knn3', KNeighborsClassifier(3)),
            ('knn6', KNeighborsClassifier(6)),
            ('knn12', KNeighborsClassifier(12)), ('nc', NearestCentroid()),
            ('rnc', RadiusNeighborsClassifier()), ('lp', LabelPropagation()),
            ('pac', PassiveAggressiveClassifier()), ('rc', RidgeClassifier()),
            ('sgdc', SGDClassifier()), ('svc', SVC()),
            ('ngbc', NGBClassifier(Dist=Bernoulli))
        ]
        self.next(self.load_raw_data)
Example #7
class myNGBoostClassifier:
    def make(self, params):
        self.model = NGBClassifier(**params)
        return self

    def fit(self, xtrain, ytrain, xtest=None, ytest=None, fit_params=None):
        fit_params = fit_params or {}
        if isinstance(xtrain, pd.DataFrame):
            xtrain = xtrain.values
            ytrain = ytrain.values
            if xtest is not None and ytest is not None:
                xtest = xtest.values
                ytest = ytest.values
        if xtest is None or ytest is None:
            self.model.fit(xtrain, ytrain, **fit_params)
        else:
            self.model.fit(xtrain, ytrain, X_val=xtest, Y_val=ytest, **fit_params)

    def predict(self, xs):
        return self.model.predict(xs)

    def predict_proba(self, xs):
        if len(xs.shape) == 1:
            return self.model.predict_proba(xs.reshape(1, -1))
        else:
            return self.model.predict_proba(xs)
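A minimal usage sketch for the wrapper above; the data split and hyperparameters are illustrative, and early_stopping_rounds is forwarded to ngboost's fit through fit_params.

# Usage sketch; data split and hyperparameter values are illustrative.
from ngboost.distns import Bernoulli
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

X, y = load_breast_cancer(return_X_y=True)
X_tr, X_va, y_tr, y_va = train_test_split(X, y, test_size=0.2, random_state=0)

clf = myNGBoostClassifier().make({'Dist': Bernoulli, 'verbose': False})
clf.fit(X_tr, y_tr, X_va, y_va, fit_params={'early_stopping_rounds': 50})
proba = clf.predict_proba(X_va)  # shape (n_samples, 2)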
Example #8
def test_bernoulli(learner, breast_cancer_data: Tuple4Array):
    X_cls_train, X_cls_test, Y_cls_train, Y_cls_test = breast_cancer_data
    # test early stopping features
    # test other args, n_trees, LR, minibatching- args as fixture
    ngb = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_cls_train, Y_cls_train)
    y_pred = ngb.predict(X_cls_test)
    y_prob = ngb.predict_proba(X_cls_test)
    y_dist = ngb.pred_dist(X_cls_test)
Example #9
	def test_bernoulli(self, learners, cls_data):
		X_cls_train, X_cls_test, Y_cls_train, Y_cls_test = cls_data
		for Learner in learners:
			# test early stopping features
			# test other args, n_trees, LR, minibatching- args as fixture
			ngb = NGBClassifier(Dist=Bernoulli, Score=LogScore, Base=Learner, verbose=False)
			ngb.fit(X_cls_train, Y_cls_train)
			y_pred = ngb.predict(X_cls_test)
			y_prob = ngb.predict_proba(X_cls_test)
			y_dist = ngb.pred_dist(X_cls_test)
Example #10
def test_categorical(k: int, learner, breast_cancer_data: Tuple4Array):
    X_train, X_test, y_train, _ = breast_cancer_data
    dist = k_categorical(k)
    y_train = np.random.randint(0, k, (len(y_train)))
    # test early stopping features
    ngb = NGBClassifier(Dist=dist, Score=LogScore, Base=learner, verbose=False)
    ngb.fit(X_train, y_train)
    y_pred = ngb.predict(X_test)
    y_prob = ngb.predict_proba(X_test)
    y_dist = ngb.pred_dist(X_test)
Example #11
	def test_categorical(self, learners, cls_data):
		X_cls_train, X_cls_test, Y_cls_train, Y_cls_test = cls_data
		for K in [2,4,7]:
			Dist = k_categorical(K)
			Y_cls_train = np.random.randint(0,K,(len(Y_cls_train)))

			for Learner in learners:
				# test early stopping features
				ngb = NGBClassifier(Dist=Dist, Score=LogScore, Base=Learner, verbose=False)
				ngb.fit(X_cls_train, Y_cls_train)
				y_pred = ngb.predict(X_cls_test)
				y_prob = ngb.predict_proba(X_cls_test)
				y_dist = ngb.pred_dist(X_cls_test)
Example #12
from ngboost.distns import Bernoulli
from ngboost import NGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split

if __name__ == "__main__":
    X, y = load_breast_cancer(return_X_y=True)
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)

    param_grid = {
        'n_estimators': [200, 500],
        'minibatch_frac': [1.0, 0.5],
    }

    ngb = NGBClassifier(natural_gradient=True, verbose=False, Dist=Bernoulli)

    grid_search = GridSearchCV(ngb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    print(grid_search.best_params_)
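The fitted search object exposes the usual sklearn attributes; a short follow-up sketch:

    # Follow-up sketch: inspect and reuse the best configuration found above.
    best_ngb = grid_search.best_estimator_      # NGBClassifier refit on all of X_train
    print(grid_search.best_score_)              # best mean cross-validated score
    print(best_ngb.predict_proba(X_train[:5]))  # class probabilities for a few rows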
Example #13
import numpy as np
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner
from ngboost.scores import MLE

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":

    np.random.seed(12345)

    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    ngb = NGBClassifier(Base=default_tree_learner,
                        Dist=Bernoulli,
                        Score=MLE,
                        verbose=True,
                        natural_gradient=True)
    ngb.fit(X_train, Y_train)

    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.prob))
Example #14
from ngboost.distns import k_categorical
from ngboost import NGBClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge

if __name__ == "__main__":
    # An example where the base learner is also searched over (this is how you would vary tree depth):

    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    b1 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=2)
    b2 = DecisionTreeRegressor(criterion='friedman_mse', max_depth=4)
    b3 = Ridge(alpha=0.0)

    param_grid = {
        'n_estimators': [20, 50],
        'minibatch_frac': [1.0, 0.5],
        'Base': [b1, b2]
    }

    ngb = NGBClassifier(natural_gradient=True,
                        verbose=False,
                        Dist=k_categorical(2))

    grid_search = GridSearchCV(ngb, param_grid=param_grid, cv=5)
    grid_search.fit(X_train, Y_train)
    print(grid_search.best_params_)
Example #15
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from ngboost import NGBClassifier
from ngboost.distns import k_categorical

if __name__ == "__main__":

    X, y = load_breast_cancer(return_X_y=True)
    y[0:15] = 2  # artificially make this a 3-class problem instead of a 2-class problem
    X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2)

    ngb = NGBClassifier(Dist=k_categorical(3))  # tell ngboost that there are 3 possible outcomes
    ngb.fit(X_train, Y_train)  # Y should have only 3 values: {0,1,2}

    # predicted probabilities of class 0, 1, and 2 (columns) for each observation (row)
    preds = ngb.predict_proba(X_test)
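Hard class labels come from predict, which returns the most likely of the k classes; a short follow-up sketch:

    # Follow-up sketch: hard labels from the fitted 3-class model above.
    labels = ngb.predict(X_test)  # most likely class per row
    assert set(labels).issubset({0, 1, 2})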
Example #16
def fixture_learners_data(breast_cancer_data, boston_data,
                          boston_survival_data):
    """
    Returns:
        A list of iterables,
        each iterable containing a fitted model and
        X data and the predictions for the X_data
    """

    models_data = []
    X_class_train, _, Y_class_train, _ = breast_cancer_data
    ngb = NGBClassifier(verbose=False, n_estimators=10)
    ngb.fit(X_class_train, Y_class_train)
    models_data.append((ngb, X_class_train, ngb.predict(X_class_train)))

    X_reg_train, _, Y_reg_train, _ = boston_data
    ngb = NGBRegressor(verbose=False, n_estimators=10)
    ngb.fit(X_reg_train, Y_reg_train)
    models_data.append((ngb, X_reg_train, ngb.predict(X_reg_train)))

    X_surv_train, _, T_surv_train, E_surv_train, _ = boston_survival_data
    ngb = NGBSurvival(verbose=False, n_estimators=10)
    ngb.fit(X_surv_train, T_surv_train, E_surv_train)
    models_data.append((ngb, X_surv_train, ngb.predict(X_surv_train)))

    ngb = NGBRegressor(Dist=MultivariateNormal(2), n_estimators=10)
    ngb.fit(X_surv_train, np.vstack([T_surv_train, E_surv_train]).T)
    models_data.append((ngb, X_surv_train, ngb.predict(X_surv_train)))
    return models_data
Example #17
import numpy as np
from ngboost import NGBClassifier
from ngboost.distns import Bernoulli

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

if __name__ == "__main__":

    np.random.seed(12345)

    X, Y = load_breast_cancer(return_X_y=True)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)

    ngb = NGBClassifier(Dist=Bernoulli)
    ngb.fit(X_train, Y_train)

    preds = ngb.pred_dist(X_test)
    print("ROC:", roc_auc_score(Y_test, preds.probs[1]))
Example #18
    def ng_model(self):
        ngb_cat = NGBClassifier(Dist=k_categorical(2), verbose=True)
        ng_clf = ngb_cat.fit(self.X_t, self.y_t)
        print(ng_clf.feature_importances_)
        return ng_clf
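ngboost reports one feature-importance vector per distribution parameter; a short hedged sketch pairing them with column names, assuming the X_t used above is a pandas DataFrame (ng_clf and X_t refer to the names from the method above):

# Follow-up sketch; assumes X_t is a pandas DataFrame with named columns.
import pandas as pd
imp = pd.DataFrame(ng_clf.feature_importances_, columns=X_t.columns)  # one row per parameter
print(imp.T.sort_values(by=0, ascending=False).head(10))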