Example No. 1
def test_copy_model():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model1 = CatBoostRegressor(iterations=5, random_seed=0)
    model1.fit(pool)
    model2 = model1.copy()
    predictions1 = model1.predict(pool)
    predictions2 = model2.predict(pool)
    assert _check_data(predictions1, predictions2)
    model2.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Example No. 2
def test_shap():
    train_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]], [0, 1, 5, 8], cat_features=[])
    test_pool = Pool([[0, 0], [0, 1], [1, 0], [1, 1]])
    model = CatBoostRegressor(iterations=1, random_seed=0, max_ctr_complexity=1, depth=2)
    model.fit(train_pool)
    shap_values = model.get_feature_importance(test_pool, fstr_type='ShapValues')

    dataset = [(0.5, 1.2), (1.6, 0.5), (1.8, 1.0), (0.4, 0.6), (0.3, 1.6), (1.5, 0.2)]
    labels = [1.1, 1.85, 2.3, 0.7, 1.1, 1.6]
    train_pool = Pool(dataset, labels, cat_features=[])

    model = CatBoost({'iterations': 10, 'random_seed': 0, 'max_ctr_complexity': 1})
    model.fit(train_pool)

    testset = [(0.6, 1.2), (1.4, 0.3), (1.5, 0.8), (1.4, 0.6)]
    predictions = model.predict(testset)
    shap_values = model.get_feature_importance(Pool(testset), fstr_type='ShapValues')
    assert(len(predictions) == len(shap_values))
    for pred_idx in range(len(predictions)):
        assert(abs(sum(shap_values[pred_idx]) - predictions[pred_idx]) < 1e-9)

    with open(FIMP_PATH, 'w') as out:
        out.write(str(shap_values))  # shap_values is an ndarray; convert to str before writing

    local_canonical_file(FIMP_PATH)
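The per-row assertion above relies on SHAP additivity. As a minimal sketch (assuming a catboost version that still accepts the fstr_type keyword used here): get_feature_importance(..., fstr_type='ShapValues') returns an array of shape (n_samples, n_features + 1) whose last column is the expected (base) value, so each row sums to the raw prediction.

import numpy as np
from catboost import CatBoostRegressor, Pool

X = [[0.5, 1.2], [1.6, 0.5], [1.8, 1.0], [0.4, 0.6]]
y = [1.1, 1.85, 2.3, 0.7]
model = CatBoostRegressor(iterations=10, random_seed=0, verbose=False)
model.fit(X, y)

shap = model.get_feature_importance(Pool(X), fstr_type='ShapValues')
assert shap.shape == (len(X), 2 + 1)                    # two features + expected-value column
assert np.allclose(shap.sum(axis=1), model.predict(X))  # additivity, as in the test above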
Example No. 3
def test_coreml_import_export():
    train_pool = Pool(QUERYWISE_TRAIN_FILE, column_description=QUERYWISE_CD_FILE)
    test_pool = Pool(QUERYWISE_TEST_FILE, column_description=QUERYWISE_CD_FILE)
    model = CatBoost(params={'loss_function': 'QueryRMSE', 'random_seed': 0, 'iterations': 20, 'thread_count': 8})
    model.fit(train_pool)
    model.save_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    canon_pred = model.predict(test_pool)
    coreml_loaded_model = CatBoostRegressor()
    coreml_loaded_model.load_model(OUTPUT_COREML_MODEL_PATH, format="coreml")
    assert all(canon_pred == coreml_loaded_model.predict(test_pool))
    return local_canonical_file(OUTPUT_COREML_MODEL_PATH)
Example No. 4
#
#
# mod = xgb.XGBClassifier(n_estimators=10000,learning_rate=.4)
#
#
# eval_set = [(select_input_columns(ts), select_output_columns_as_row(ts))]
#
# mod.fit(select_input_columns(tr), select_output_columns_as_row(tr) ,eval_metric=xgb_f1, eval_set=eval_set, verbose=True)

#g=f1_score(select_output_columns_as_row(ts),mod.predict(select_input_columns(ts)))

########################################################################################################################
#def r():
#global input_columns_classify
clas = CatBoostClassifier(iterations=10000, eval_metric='F1')
reg = CatBoostRegressor(iterations=np.random.randint(1, 4), eval_metric='MAE')

cat_features = []
if 'weekday' in input_columns_regress: cat_features.append('weekday')
if 'time' in input_columns_regress: cat_features.append('time')
reg.fit(
    select_input_columns_regress(pd_vstack([tr, ts, oo])),
    select_output_columns_as_row_regress(pd_vstack([tr, ts, oo])),
    eval_set=[
        ((select_input_columns_regress(remove_label_true(pd_vstack([tr,
                                                                    ts])))),
         select_output_columns_as_row_regress(
             remove_label_true(pd_vstack([tr, ts])))),
        ((select_input_columns_regress(remove_label_false(pd_vstack([tr,
                                                                     ts])))),
         select_output_columns_as_row_regress(
Example No. 5
def catboost_train_regression(
    training_data_path: InputPath('CSV'),
    model_path: OutputPath('CatBoostModel'),
    starting_model_path: InputPath('CatBoostModel') = None,
    label_column: int = 0,
    loss_function: str = 'RMSE',
    num_iterations: int = 500,
    learning_rate: float = None,
    depth: int = 6,
    random_seed: int = 0,
    cat_features: list = None,
    additional_training_options: dict = {},
):
    '''Train a CatBoost regressor model.
    Args:
        training_data_path: Path for the training data in CSV format.
        model_path: Output path for the trained model in binary CatBoostModel format.
        starting_model_path: Path for the existing trained model to start from.
        label_column: Column containing the label data.
        loss_function: The loss function to optimize during training; it also determines the type of
            machine learning problem to solve. Default = 'RMSE'. Possible values:
            'RMSE', 'MAE', 'Quantile:alpha=value', 'LogLinQuantile:alpha=value', 'Poisson', 'MAPE', 'Lq:q=value'
        num_iterations: Number of trees to add to the ensemble.
        learning_rate: Step size shrinkage used in update to prevents overfitting.
            Default value is selected automatically for binary classification with other parameters set to default.
            In all other cases default is 0.03.
        depth: Depth of a tree. All trees are the same depth. Default = 6
        random_seed: Random number seed. Default = 0
        cat_features: A list of Categorical features (indices or names).
        additional_training_options: A dictionary with additional options to pass to CatBoostRegressor
    Outputs:
        model: Trained model in binary CatBoostModel format.
    Annotations:
        author: Alexey Volkov <*****@*****.**>
    '''
    import tempfile
    from pathlib import Path

    from catboost import CatBoostRegressor, Pool

    column_descriptions = {label_column: 'Label'}
    column_description_path = tempfile.NamedTemporaryFile(delete=False).name
    with open(column_description_path, 'w') as column_description_file:
        for idx, kind in column_descriptions.items():
            column_description_file.write('{}\t{}\n'.format(idx, kind))

    train_data = Pool(
        training_data_path,
        column_description=column_description_path,
        has_header=True,
        delimiter=',',
    )

    model = CatBoostRegressor(
        iterations=num_iterations,
        depth=depth,
        learning_rate=learning_rate,
        loss_function=loss_function,
        random_seed=random_seed,
        verbose=True,
        **additional_training_options,
    )

    model.fit(
        train_data,
        cat_features=cat_features,
        init_model=starting_model_path,
        #verbose=False,
        #plot=True,
    )
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    model.save_model(model_path)
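A minimal usage sketch for the component above, calling the function directly rather than through a KFP pipeline. The file names are hypothetical; the CSV is assumed to be comma-delimited with a header row and the label in column 0, matching the Pool arguments inside the function.

# Hypothetical paths; assumes a local comma-delimited 'train.csv' with a header row
# and the label in column 0.
catboost_train_regression(
    training_data_path='train.csv',
    model_path='model.cbm',
    label_column=0,
    loss_function='RMSE',
    num_iterations=100,
)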
Example No. 6
categorical_features = open("../excluded_categorical_columns.txt").read().splitlines()
independent_variables += categorical_features

# X=X.fillna(-1)
X = pd.DataFrame(data, columns=independent_variables)
# print(X.columns)
y = data[dependent_variable]

#  convert categorical columns to integers
cat_dims = [X.columns.get_loc(i) for i in categorical_features[:-1]]
for header in categorical_features:
    X[header] = X[header].astype('category').cat.codes

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

model = CatBoostRegressor()
# grid_parameters = {'depth': [3,1,2,6,4,5,7,8,9,10],
#               'learning_rate': [0.01, 0.03, 0.05, 0.07, 0.1, 0.13, 0.15, 0.2],
#               'iterations': [30, 50, 100, 200, 400, 600, 800, 1000],
#               # 'loss_function': ['RMSE', 'MultiRMSE', 'MAE',  'Quantile', 'LogLinQuantile', 'Poisson'],
#               'l2_leaf_reg': [1, 3, 5, 7, 9, 10,50, 100],
#               # 'border_count':[32,5,10,20,50,100,200],
#               # 'ctr_border_count':[50,5,10,20,100,200],
#               }
# parameters used previously
# grid_parameters = {'depth': [3,1,2,6,4,5,7,8,9,10],
#                     'learning_rate': [0.01,0.02,0.03,0.05,0.07, 0.1,0.15],
#               'iterations': [30, 50, 100,200,400,600,800,1000,1200],
#               'l2_leaf_reg': [1, 3, 5, 7, 9, 10,50, 100],
#                 # 'border_count':[32,5,10,20,50,100,200],
#               }
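The commented-out dictionaries above only sketch a hyper-parameter grid; nothing runs the search. A minimal hedged example using scikit-learn's GridSearchCV on the X_train/y_train split created above (the reduced grid is illustrative, not the author's intended values):

from sklearn.model_selection import GridSearchCV

# Small illustrative grid; extend with values from the commented dictionaries as needed.
small_grid = {
    'depth': [4, 6, 8],
    'learning_rate': [0.03, 0.1],
    'iterations': [200, 400],
}
search = GridSearchCV(CatBoostRegressor(verbose=False), small_grid, cv=3,
                      scoring='neg_root_mean_squared_error')
search.fit(X_train, y_train)
print(search.best_params_, -search.best_score_)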
Example No. 7
def main(iterations):
    # Download train and validation datasets
    train_df, test_df = msrank()
    # Column 0 contains label values, column 1 contains group ids.
    X_train, y_train = train_df.drop([0, 1], axis=1).values, train_df[0].values
    X_test, y_test = test_df.drop([0, 1], axis=1).values, test_df[0].values

    # Split the train data into two parts: the first for the baseline model,
    # the second for the main model.
    splitted_data = train_test_split(X_train, y_train, test_size=0.5)
    X_train_first, X_train_second, y_train_first, y_train_second = splitted_data

    catboost_model = CatBoostRegressor(iterations=iterations, verbose=False)

    # Prepare simple baselines (just mean target on first part of train pool).
    baseline_value = y_train_first.mean()
    train_baseline = np.array([baseline_value] * y_train_second.shape[0])
    test_baseline = np.array([baseline_value] * y_test.shape[0])

    # Create pools
    train_pool = Pool(X_train_second, y_train_second, baseline=train_baseline)
    test_pool = Pool(X_test, y_test, baseline=test_baseline)

    # Train CatBoost model
    catboost_model.fit(train_pool, eval_set=test_pool, verbose=True, plot=False, save_snapshot=True)
    catboost_model.save_model("example.cbm")

    catboost_model = CatBoostRegressor()
    catboost_model.load_model("example.cbm")

    # Apply model on pool with baseline values
    preds1 = catboost_model.predict(test_pool)

    # Apply model on numpy.array and then add the baseline values
    preds2 = test_baseline + catboost_model.predict(X_test)

    # Check that preds have small diffs
    assert (np.abs(preds1 - preds2) < 1e-6).all()
Example No. 8
    def fit(self,
            X,
            y,
            sample_weight=None,
            eval_set=None,
            sample_weight_eval_set=None,
            **kwargs):
        logger = None
        if self._make_logger:
            # Example use of logger, with required import of:
            #  from h2oaicore.systemutils import make_experiment_logger, loggerinfo
            # Can use loggerwarning, loggererror, etc. for different levels
            if self.context and self.context.experiment_id:
                logger = make_experiment_logger(
                    experiment_id=self.context.experiment_id,
                    tmp_dir=self.context.tmp_dir,
                    experiment_tmp_dir=self.context.experiment_tmp_dir)

        if self._show_logger_test:
            loggerinfo(logger, "TestLOGGER: Fit CatBoost")

        if self._show_task_test:
            # Example task sync operations
            if hasattr(self, 'test_count'):
                self.test_count += 1
            else:
                self.test_count = 0

            # The below generates a message in the GUI notifications panel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                warning = "TestWarning: First CatBoost fit for this model instance"
                loggerwarning(logger, warning)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='warning', data=warning))
                    task.flush()

            # The below generates a message in the GUI top-middle panel above the progress wheel
            if self.test_count == 0 and self.context and self.context.experiment_id:
                message = "Tuning CatBoost"
                loggerinfo(logger, message)
                task = kwargs.get('task')
                if task:
                    task.sync(key=self.context.experiment_id,
                              progress=dict(type='update', message=message))
                    task.flush()

        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType

        # label encode target and setup type of problem
        lb = LabelEncoder()
        if self.num_classes >= 2:
            lb.fit(self.labels)
            y = lb.transform(y)
            if eval_set is not None:
                valid_X = eval_set[0][0]
                valid_y = eval_set[0][1]
                valid_y = lb.transform(valid_y)
                eval_set = [(valid_X, valid_y)]
            self.params.update({'objective': 'Logloss'})
        if self.num_classes > 2:
            self.params.update({'objective': 'MultiClass'})

        if isinstance(X, dt.Frame):
            orig_cols = list(X.names)
            numeric_cols = list(X[:, [bool, int, float]].names)
        else:
            orig_cols = list(X.columns)
            numeric_cols = list(X.select_dtypes([np.number]).columns)

        # unlike lightgbm, which needs label-encoded categoricals, catboost can take raw strings etc.
        self.params['cat_features'] = [
            i for i, x in enumerate(orig_cols)
            if 'CatOrig:' in x or 'Cat:' in x or x not in numeric_cols
        ]

        if not self.get_uses_gpus(self.params):
            # monotonicity constraints not available for GPU for catboost
            # get names of columns in same order
            X_names = list(dt.Frame(X).names)
            X_numeric = self.get_X_ordered_numerics(X)
            X_numeric_names = list(X_numeric.names)
            self.set_monotone_constraints(X=X_numeric, y=y, params=self.params)
            numeric_constraints = copy.deepcopy(
                self.params['monotone_constraints'])
            # if non-numerics, then fix those to have 0 constraint
            self.params['monotone_constraints'] = [0] * len(X_names)
            colnumi = 0
            for coli, colname in enumerate(X_names):
                if colname in X_numeric_names:
                    self.params['monotone_constraints'][coli] = numeric_constraints[colnumi]
                    colnumi += 1

        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> catboost internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy(
            )  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X,
                                     dtype=np.float32 if config.data_precision
                                     == "float32" else np.float64)
            if eval_set is not None:
                valid_X = eval_set[0][0].to_numpy(
                )  # don't assign back to X so don't damage during predict
                valid_X = np.ascontiguousarray(
                    valid_X,
                    dtype=np.float32
                    if config.data_precision == "float32" else np.float64)
                valid_y = eval_set[0][1]
                eval_set = [(valid_X, valid_y)]

        if eval_set is not None:
            valid_X_shape = eval_set[0][0].shape
        else:
            valid_X_shape = None

        X, eval_set = self.process_cats(X, eval_set, orig_cols)

        # modify self.params_base['gpu_id'] based upon actually-available GPU based upon training and valid shapes
        self.acquire_gpus_function(train_shape=X.shape,
                                   valid_shape=valid_X_shape)

        params = copy.deepcopy(
            self.params
        )  # keep separate, since it can then be pulled from lightgbm params
        params = self.transcribe_and_filter_params(params, eval_set
                                                   is not None)

        if logger is not None:
            loggerdata(
                logger,
                "CatBoost parameters: params_base : %s params: %s catboost_params: %s"
                % (str(self.params_base), str(self.params), str(params)))

        if self.num_classes == 1:
            model = CatBoostRegressor(**params)
        else:
            model = CatBoostClassifier(**params)
        # Hit sometimes: Exception: catboost/libs/data_new/quantization.cpp:779: All features are either constant or ignored.
        if self.num_classes == 1:
            # assume not mae, which would use median
            # baseline = [np.mean(y)] * len(y)
            baseline = None
        else:
            baseline = None

        kargs = dict(X=X,
                     y=y,
                     sample_weight=sample_weight,
                     baseline=baseline,
                     eval_set=eval_set)
        pickle_path = None
        if config.debug_daimodel_level >= 2:
            self.uuid = str(uuid.uuid4())[:6]
            pickle_path = "catboost%s.pickle" % self.uuid
            save_obj((model, kargs), pickle_path)

        # FIT
        model.fit(**kargs)

        if pickle_path is not None and config.debug_daimodel_level <= 2:
            remove(pickle_path)

        # https://catboost.ai/docs/concepts/python-reference_catboostclassifier.html
        # need to move to wrapper
        if model.get_best_iteration() is not None:
            iterations = model.get_best_iteration() + 1
        else:
            iterations = self.params['n_estimators']
        # must always set best_iterations
        self.model_path = None
        importances = copy.deepcopy(model.feature_importances_)
        if not self._save_by_pickle:
            self.uuid = str(uuid.uuid4())[:6]
            model_file = "catboost_%s.bin" % str(self.uuid)
            self.model_path = os.path.join(self.context.experiment_tmp_dir,
                                           model_file)
            model.save_model(self.model_path)
            with open(self.model_path, mode='rb') as f:
                model = f.read()
        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances,
                                  iterations=iterations)
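The comment in the fit() above notes that CatBoost, unlike LightGBM, accepts raw string categoricals. A minimal self-contained sketch of that behavior on toy data ('city' is a hypothetical categorical column passed by name via cat_features):

import pandas as pd
from catboost import CatBoostRegressor

toy = pd.DataFrame({
    'city': ['ams', 'ber', 'ams', 'par', 'ber', 'par'],   # raw strings, no label encoding
    'size': [50, 70, 65, 40, 80, 55],
    'price': [300, 450, 380, 250, 520, 330],
})
toy_model = CatBoostRegressor(iterations=50, verbose=False)
toy_model.fit(toy[['city', 'size']], toy['price'], cat_features=['city'])
print(toy_model.predict(toy[['city', 'size']])[:3])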
Example No. 9
X = dw.drop(['weight'], axis=1)
y = dw.weight
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(
    X, y, train_size=0.8, random_state=42)

# In[34]:

categorical_features_indices = np.where(X.dtypes != float)[0]  # np.float is removed in recent NumPy

# In[35]:

model = CatBoostRegressor(iterations=1,
                          depth=10,
                          learning_rate=0.1,
                          loss_function='RMSE',
                          use_best_model=True)
model.fit(X_train,
          y_train,
          cat_features=categorical_features_indices,
          eval_set=(X_validation, y_validation))

# In[84]:

da = ds.sample(frac=1)


def conv(s):
    if s < 15:
        return 0
Example No. 10
cab_oof_pred  = np.zeros_like(y, dtype=float)   # np.float is removed in recent NumPy
lgbm_oof_pred = np.zeros_like(y, dtype=float)
scores, models = [], []
skf = StratifiedKFold(n_splits=N_SPLITS, random_state=RANDOM_SEED, shuffle=True)
for i, (train_idx, valid_idx) in enumerate(skf.split(train, train['Publisher'])):
    x_train, x_valid = train.iloc[train_idx], train.iloc[valid_idx]
    y_train, y_valid = y.iloc[train_idx], y.iloc[valid_idx]
    
    # Folds are split by Publisher, so drop the columns only after splitting the data
    x_train = x_train.drop(drop_column, axis=1)
    x_valid = x_valid.drop(drop_column, axis=1)

    train_data = Pool(x_train, y_train)
    valid_data = Pool(x_valid, y_valid)

    model = CatBoostRegressor(**cab_params)
    model.fit(train_data, 
            eval_set=valid_data,
            early_stopping_rounds=50,
            verbose=False,
            use_best_model=True)
    cab_valid_pred = model.predict(x_valid)
    score = mean_squared_error(y_valid, cab_valid_pred) ** .5
    print(f'Fold {i} CAB RMSLE: {score}')

    cab_oof_pred[valid_idx] = cab_valid_pred
    models.append(model)
    scores.append(score)

    model = lgbm.LGBMRegressor(**lgbm_params)
    model.fit(x_train, y_train,
Example No. 11
        ])
        labels.append(evt.r)
        energy_true.append(evt.E0)
print(len(features))

#train_features = np.array(features[:820000])
test_features = np.array(features[820000:])
#eval_features = features[770000:820000]
#train_labels = labels[:820000]
test_labels = labels[820000:]
#energy_test = energy_true[820000:]
#eval_labels = labels[770000:820000]

print(len(test_features))

model = CatBoostRegressor()  # learning_rate=0.1, iterations=3000, depth=10, loss_function='RMSE', l2_leaf_reg=14, od_type='Iter', od_wait=50

model.load_model("models/vertex.model")
#fit_model = model.fit(train_features, train_labels) #eval_set = (eval_features,eval_labels))
predictions = model.predict(test_features)

#print (fit_model.get_params())

#mse = mean_squared_error(test_labels, predictions)
#print("MSE: %.4f" % mse)

#model.save_model("vertex.model", format="cbm", export_parameters=None)

print(findmaximum(predictions - test_labels),
      findsigma(predictions - test_labels))
plt.hist(predictions - test_labels, bins=100, range=[-750, 750])
Example No. 12
import numpy
from catboost import CatBoostRegressor

dataset = numpy.array([[1, 4, 5, 6], [4, 5, 6, 7], [30, 40, 50, 60],
                       [20, 15, 85, 60]])
train_labels = [1.2, 3.4, 9.5, 24.5]
model = CatBoostRegressor(learning_rate=1, depth=6, loss_function='RMSE')
fit_model = model.fit(dataset, train_labels)

print(fit_model.get_params())
Example No. 13
    def get_base_estimator(self, model, create_nn_model=None):
        # keras config
        tf.random.set_seed(42)

        # torch config
        # for reproducibility
        torch.manual_seed(42)
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
        # gpu or cpu
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

        if model == 'log_reg':
            return LogisticRegression(solver='lbfgs')
        elif model == 'log_reg_cv':
            return LogisticRegressionCV()
        elif model == 'linear_reg':
            return LinearRegression()
        elif model == 'lasso':
            return Lasso()
        elif model == 'ridge':
            return Ridge()
        elif model == 'svc':
            return SVC()
        elif model == 'svr':
            return SVR()
        elif model == 'l_svc':
            return LinearSVC()
        elif model == 'l_svr':
            return LinearSVR()
        elif model == 'rf_clf':
            return RandomForestClassifier()
        elif model == 'rf_reg':
            return RandomForestRegressor()
        elif model == 'gbdt_clf':
            return GradientBoostingClassifier()
        elif model == 'gbdt_reg':
            return GradientBoostingRegressor()
        elif model == 'knn_clf':
            return KNeighborsClassifier()
        elif model == 'knn_reg':
            return KNeighborsRegressor()
        elif model == 'g_mix':
            return GaussianMixture()
        elif model == 'g_nb':
            return GaussianNB()
        elif model == 'preceptron':
            return Perceptron()
        elif model == 'sgd_clf':
            return SGDClassifier()
        elif model == 'sgd_reg':
            return SGDRegressor()
        elif model == 'dt_clf':
            return DecisionTreeClassifier()
        elif model == 'dt_reg':
            return DecisionTreeRegressor()
        elif model == 'xgb_clf':
            return XGBClassifier()
        elif model == 'xgb_reg':
            return XGBRegressor()
        elif model == 'lgb_clf':
            return LGBMClassifier()
        elif model == 'lgb_reg':
            return LGBMRegressor()
        elif model == 'catb_clf':
            return CatBoostClassifier()
        elif model == 'catb_reg':
            return CatBoostRegressor()
        elif model == 'rgf_clf':
            return RGFClassifier()
        elif model == 'rgf_reg':
            return RGFRegressor()
        elif model == 'keras_clf':
            return MyKerasClassifier(build_fn=create_nn_model)
        elif model == 'keras_reg':
            return MyKerasRegressor(build_fn=create_nn_model)
        elif model == 'torch_clf':
            return NeuralNetClassifier(module=create_nn_model(),
                                       device=device,
                                       train_split=None)
        elif model == 'torch_reg':
            return NeuralNetRegressor(module=create_nn_model(),
                                      device=device,
                                      train_split=None)
        elif model == 'tabnet_clf':
            return TabNetClassifier()
        elif model == 'tabnet_reg':
            return TabNetRegressor()
        else:
            logger.error('NOT IMPLEMENTED BASE MODEL: %s' % model)
            raise Exception('NOT IMPLEMENTED')
Example No. 14
    k_y_train = Y_data[train_index]
    k_x_vali = X_data[vali_index]
    k_y_vali = Y_data[vali_index]
    cb_params = {
        'n_estimators': 1000000,
        'loss_function': 'MAE',
        'eval_metric': 'MAE',
        'learning_rate': 0.02,
        'depth': 6,
        'use_best_model': True,
        'subsample': 0.6,
        'bootstrap_type': 'Bernoulli',
        'reg_lambda': 3,
        'one_hot_max_size': 2,
    }
    model_cb = CatBoostRegressor(**cb_params)
    # train the model
    model_cb.fit(k_x_train, k_y_train, eval_set=[(k_x_vali, k_y_vali)], verbose=300, early_stopping_rounds=300)
    oof_cb[vali_index] = model_cb.predict(k_x_vali, ntree_end=model_cb.best_iteration_)
    predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / kfolder.n_splits
    predictions_train_cb += model_cb.predict(X_data, ntree_end=model_cb.best_iteration_) / kfolder.n_splits

print("catboost score: {:<8.8f}".format(mean_absolute_error(np.expm1(oof_cb), np.expm1(Y_data))))

output_path = path + '/user_data/'
# Test set output
predictions = predictions_cb
predictions[predictions < 0] = 0
sub = pd.DataFrame()
sub['SaleID'] = TestA_data.SaleID
sub['price'] = predictions
Example No. 15
    def grid(self, method, params={}, ver=2, griall=False):

        from sklearn.model_selection import GridSearchCV

        if method == 'mlp':  # if using MLP, scale the data first

            from sklearn.preprocessing import StandardScaler
            scaler = StandardScaler()
            scaler.fit(self.X_train)
            X_train = scaler.transform(self.X_train)
            X_test = scaler.transform(self.X_test)
        else:
            X_train = self.X_train
            X_test = self.X_test

        if params:
            if method == 'rf':
                from sklearn.ensemble import RandomForestRegressor
                classifier = RandomForestRegressor()
                grid_params = params

            elif method == 'dt':
                from sklearn.tree import DecisionTreeRegressor

                classifier = DecisionTreeRegressor()
                grid_params = params

            elif method == 'mlp':
                from sklearn.neural_network import MLPRegressor
                classifier = MLPRegressor()
                grid_params = params

            elif method == 'lr':
                from sklearn.linear_model import LinearRegression
                classifier = LinearRegression()
                grid_params = params

            elif method == 'gbm':
                from sklearn.ensemble import GradientBoostingRegressor
                classifier = GradientBoostingRegressor()
                grid_params = params

            elif method == 'xgb':
                from xgboost import XGBRegressor
                classifier = XGBRegressor()
                grid_params = params

            elif method == 'lgbm':
                from lightgbm import LGBMRegressor
                classifier = LGBMRegressor()
                grid_params = params

            elif method == 'cat':
                from catboost import CatBoostRegressor
                classifier = CatBoostRegressor(silent=True)
                grid_params = params
            elif method == 'svm':
                from sklearn.svm import SVR
                classifier = SVR()
                grid_params = params
            elif method == 'knn':
                from sklearn.neighbors import KNeighborsRegressor
                classifier = KNeighborsRegressor()
                grid_params = params

            else:
                print('Unknown method')
                return

        else:
            if method == 'rf':
                from sklearn.ensemble import RandomForestRegressor
                classifier = RandomForestRegressor()
                grid_params = {
                    "max_depth": [8, 10, 11, 13, 15, 18],
                    "max_features": [5, 10, 15, 20],
                    "n_estimators": [5, 10, 50, 100, 200, 500],
                    "min_samples_split": [3, 5, 10],
                    "criterion": ['mse', 'mae']
                }

            elif method == 'dt':
                from sklearn.tree import DecisionTreeRegressor
                classifier = DecisionTreeRegressor()
                grid_params = {
                    "max_depth": range(1, 10),
                    "min_samples_split": list(range(2, 50)),
                    "criterion": ['mse', 'mae']
                }

            elif method == 'mlp':
                from sklearn.neural_network import MLPRegressor
                classifier = MLPRegressor()
                grid_params = {
                    'alpha': [0.1, 0.01, 0.001, 0.005, 0.0001, 0.00001],
                    'hidden_layer_sizes': [(10, 10, 10), (45, 50, 60),
                                           (25, 35, 45), (15, 15), (100, ),
                                           (100, 100)],
                    'solver': ['lbfgs', 'adam', 'sgd'],
                    'activation': ['relu', 'logistic', 'tanh', 'identity']
                }

            elif method == 'gbm':
                from sklearn.ensemble import GradientBoostingRegressor
                classifier = GradientBoostingRegressor()
                grid_params = {
                    'loss': ['ls', 'lad', 'huber', 'quantile'],
                    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
                    'n_estimators': [100, 500, 1000, 1500],
                    'max_depth': [3, 5, 6],
                    'min_samples_split': [2, 5, 10, 15],
                    'subsample': [0.6, 1.0]
                }

            elif method == 'xgb':
                from xgboost import XGBRegressor
                classifier = XGBRegressor()
                grid_params = {
                    'colsample_bytree': [0.6, 1.0],
                    'n_estimators': [100, 200, 500, 1000],
                    'max_depth': [4, 5, 6, 7],
                    'min_child_weight': [0.8, 0.9, 1],
                    'learning_rate': [0.1, 0.01, 0.02, 0.05]
                }

            elif method == 'lgbm':
                from lightgbm import LGBMRegressor
                classifier = LGBMRegressor()
                grid_params = {
                    'subsample': [0.6, 0.8, 1.0],
                    'n_estimators': [100, 500, 1000, 1500],
                    'max_depth': [4, 5, 6, 7],
                    'min_child_samples': [10, 20],
                    'learning_rate': [0.2, 0.1, 0.01, 0.02, 0.05],
                    'importance_type': ['gain', 'split']
                }

            elif method == 'cat':
                from catboost import CatBoostRegressor
                classifier = CatBoostRegressor(silent=True)
                grid_params = {
                    'iterations': [200, 500],
                    'learning_rate': [0.01, 0.02, 0.05],
                    'depth': [3, 5, 8]
                }

            elif method == 'svm':
                from sklearn.svm import SVR
                classifier = SVR()
                grid_params = {
                    'C': np.arange(0.1, 2, 0.1),
                    'kernel': ['linear', 'rbf', 'poly']
                }

            elif method == 'knn':
                from sklearn.neighbors import KNeighborsRegressor
                classifier = KNeighborsRegressor()
                grid_params = {
                    'n_neighbors': np.arange(1, 40),
                    'weights': ['uniform', 'distance'],
                    'metric': ['minkowski', 'euclidean', 'manhattan']
                }

            else:
                print('Unknown method')
                return

        grid_cv = GridSearchCV(classifier,
                               grid_params,
                               cv=5,
                               n_jobs=-1,
                               verbose=ver)
        grid_cv_model = grid_cv.fit(X_train, self.y_train)

        # print the best parameters, then rebuild the model using them
        print("Best parameters: " + str(grid_cv_model.best_params_))

        if method == 'rf':

            classifier = RandomForestRegressor(
                max_depth=grid_cv_model.best_params_['max_depth'],
                max_features=grid_cv_model.best_params_['max_features'],
                n_estimators=grid_cv_model.best_params_['n_estimators'],
                min_samples_split=grid_cv_model.
                best_params_['min_samples_split'],
                criterion=grid_cv_model.best_params_['criterion'])

        elif method == 'dt':

            classifier = DecisionTreeRegressor(
                max_depth=grid_cv_model.best_params_['max_depth'],
                min_samples_split=grid_cv_model.
                best_params_['min_samples_split'],
                criterion=grid_cv_model.best_params_['criterion'])

        elif method == 'mlp':
            classifier = MLPRegressor(
                alpha=grid_cv_model.best_params_['alpha'],
                hidden_layer_sizes=grid_cv_model.
                best_params_['hidden_layer_sizes'],
                solver=grid_cv_model.best_params_['solver'],
                activation=grid_cv_model.best_params_['activation'])

        elif method == 'gbm':
            from sklearn.ensemble import GradientBoostingRegressor
            classifier = GradientBoostingRegressor(
                learning_rate=grid_cv_model.best_params_['learning_rate'],
                n_estimators=grid_cv_model.best_params_['n_estimators'],
                max_depth=grid_cv_model.best_params_['max_depth'],
                min_samples_split=grid_cv_model.
                best_params_['min_samples_split'],
                loss=grid_cv_model.best_params_['loss'],
                subsample=grid_cv_model.best_params_['subsample'])

        elif method == 'xgb':
            from xgboost import XGBRegressor
            classifier = XGBRegressor(
                colsample_bytree=grid_cv_model.
                best_params_['colsample_bytree'],
                n_estimators=grid_cv_model.best_params_['n_estimators'],
                max_depth=grid_cv_model.best_params_['max_depth'],
                min_child_weight=grid_cv_model.
                best_params_['min_child_weight'],
                learning_rate=grid_cv_model.best_params_['learning_rate'])

        elif method == 'lgbm':
            from lightgbm import LGBMRegressor
            classifier = LGBMRegressor(
                subsample=grid_cv_model.best_params_['subsample'],
                n_estimators=grid_cv_model.best_params_['n_estimators'],
                max_depth=grid_cv_model.best_params_['max_depth'],
                min_child_samples=grid_cv_model.
                best_params_['min_child_samples'],
                learning_rate=grid_cv_model.best_params_['learning_rate'],
                importance_type=grid_cv_model.best_params_['importance_type'])

        elif method == 'cat':
            from catboost import CatBoostRegressor
            classifier = CatBoostRegressor(
                silent=True,
                iterations=grid_cv_model.best_params_['iterations'],
                learning_rate=grid_cv_model.best_params_['learning_rate'],
                depth=grid_cv_model.best_params_['depth'])

        elif method == 'svm':
            from sklearn.svm import SVR
            classifier = SVR(C=grid_cv_model.best_params_['C'],
                             kernel=grid_cv_model.best_params_['kernel'])
        elif method == 'knn':
            from sklearn.neighbors import KNeighborsRegressor
            classifier = KNeighborsRegressor(
                n_neighbors=grid_cv_model.best_params_['n_neighbors'],
                weights=grid_cv_model.best_params_['weights'],
                metric=grid_cv_model.best_params_['metric'])

        print('Result for ', method)
        from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score
        classifier.fit(X_train, self.y_train)
        y_pred = classifier.predict(X_test)

        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        if griall:
            self.rmse.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))
Example No. 16
    def default_processes(self):  # applies the standard regression methods with default hyperparameters

        from warnings import filterwarnings

        filterwarnings('ignore')
        from sklearn.linear_model import LinearRegression
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.ensemble import RandomForestRegressor
        from sklearn.neural_network import MLPRegressor
        from sklearn.ensemble import GradientBoostingRegressor
        from xgboost import XGBRegressor
        from lightgbm import LGBMRegressor
        from catboost import CatBoostRegressor
        from sklearn.neighbors import KNeighborsRegressor

        from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score

        acc = []
        acc_colmns = [
            'lr', 'dtc', 'rfc', 'mlpc', 'svm', 'gbm', 'xgb', 'lgbc',
            'catboost', 'knn'
        ]

        lr = LinearRegression()
        lr.fit(self.X_train, self.y_train)
        y_pred = lr.predict(self.X_test)
        print('Results for default LinearRegression ')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        dtc = DecisionTreeRegressor()
        dtc.fit(self.X_train, self.y_train)
        y_pred = dtc.predict(self.X_test)
        print('Results for default decision tree')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        rfc = RandomForestRegressor()
        rfc.fit(self.X_train, self.y_train)
        y_pred = rfc.predict(self.X_test)
        print('Results for default random forest')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        from sklearn.preprocessing import StandardScaler
        scaler = StandardScaler()
        scaler.fit(self.X_train)
        X_train_scaled = scaler.transform(self.X_train)
        X_test_scaled = scaler.transform(self.X_test)

        mlpc = MLPRegressor()
        mlpc.fit(X_train_scaled, self.y_train)
        y_pred = mlpc.predict(X_test_scaled)
        print('Results for default MLP')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        from sklearn.svm import SVR
        svm = SVR().fit(self.X_train, self.y_train)
        y_pred = svm.predict(self.X_test)
        print('Results for default SVR')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        gbc = GradientBoostingRegressor()
        gbc.fit(self.X_train, self.y_train)
        y_pred = gbc.predict(self.X_test)  # gbc was fit on unscaled features
        print('Results for default Gradient Boosting ')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        xgb = XGBRegressor().fit(self.X_train, self.y_train)
        y_pred = xgb.predict(self.X_test)
        print('Results for default XGBoost ')
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        lgbm = LGBMRegressor().fit(self.X_train, self.y_train)
        print('Results for default LGBM')
        y_pred = lgbm.predict(self.X_test)
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        cat = CatBoostRegressor(silent=True).fit(self.X_train, self.y_train)

        print('Results for default CatBoost')
        y_pred = cat.predict(self.X_test)
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        knn = KNeighborsRegressor().fit(self.X_train, self.y_train)
        print('Results for default KNeighborsRegressor')
        y_pred = knn.predict(self.X_test)
        print(np.sqrt(mean_squared_error(self.y_test, y_pred)))
        acc.append(np.sqrt(mean_squared_error(self.y_test, y_pred)))

        acc = [i * 100 for i in acc]

        accuracy = pd.DataFrame({"RMSE": acc}, index=acc_colmns)
        accuracy.sort_values(by="RMSE", axis=0,
                             ascending=True).plot(kind="barh", color="r")
Example No. 17
df_kag = sc_x.transform(df_kag)

# ### train-test split

# In[6]:

X_train, X_test, y_train, y_test = model_selection.train_test_split(
    df, income, test_size=0.15)

# ### Train a CatBoostRegressor

# In[7]:

reg = CatBoostRegressor(iterations=2000,
                        eval_metric='RMSE',
                        depth=8,
                        bagging_temperature=0.2,
                        learning_rate=0.02)
reg.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ### predict and get RMSE

# In[8]:

# Test
y_pred = reg.predict(X_test)

print('Root Mean Squared Error:',
      np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

# ### publish results
Example No. 18
def train_model(X,
                X_test,
                y,
                params=None,
                folds=folds,
                model_type='lgb',
                plot_feature_importance=False,
                model=None):

    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    feature_importance = pd.DataFrame()
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print('Fold', fold_n, 'started at', time.ctime())
        if type(X) == np.ndarray:
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X.iloc[train_index], X.iloc[valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params, n_estimators=50000, n_jobs=-1)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric='mae',
                      verbose=10000,
                      early_stopping_rounds=200)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=500,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = mean_absolute_error(y_valid, y_pred_valid)
            print(f'Fold {fold_n}. MAE: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(iterations=20000,
                                      eval_metric='MAE',
                                      **params)
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        scores.append(mean_absolute_error(y_valid, y_pred_valid))

        prediction += y_pred

        if model_type == 'lgb':
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= n_fold

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance["importance"] /= n_fold
        if plot_feature_importance:
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')

            return oof, prediction, feature_importance
        return oof, prediction, scores

    else:
        return oof, prediction, scores
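A hedged usage sketch for the 'cat' branch of train_model above, with toy data. It assumes the globals and imports the function relies on (n_fold, time, CatBoostRegressor, mean_absolute_error, pandas/numpy) are in scope; note the branch hardcodes 20,000 iterations, so even this toy run is not instant.

from sklearn.model_selection import KFold
import numpy as np
import pandas as pd

n_fold = 3
folds = KFold(n_splits=n_fold, shuffle=True, random_state=42)

X_demo = pd.DataFrame(np.random.rand(60, 4), columns=list('abcd'))
y_demo = pd.Series(np.random.rand(60))
X_test_demo = pd.DataFrame(np.random.rand(20, 4), columns=list('abcd'))

oof_demo, pred_demo, cv_scores = train_model(
    X_demo, X_test_demo, y_demo,
    params={'learning_rate': 0.1, 'depth': 4},
    folds=folds, model_type='cat')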
Example No. 19
def catboost_regressor_learner(df: pd.DataFrame,
                               features: List[str],
                               target: str,
                               learning_rate: float = 0.1,
                               num_estimators: int = 100,
                               extra_params: Dict[str, Any] = None,
                               prediction_column: str = "prediction",
                               weight_column: str = None) -> LearnerReturnType:
    """
    Fits a CatBoost regressor to the dataset. It first generates a Pool
    with the specified features and labels from `df`, then fits a CatBoost
    model to this Pool. Returns the predict function for the model and the
    predictions for the input dataset.

    Parameters
    ----------

    df : pandas.DataFrame
        A Pandas' DataFrame with features and target columns.
        The model will be trained to predict the target column
        from the features.

    features : list of str
        A list of column names that are used as features for the model. All these names
        should be in `df`.

    target : str
        The name of the column in `df` that should be used as target for the model.
        This column should be numerical and continuous, since this is a regression model.

    learning_rate : float
        Float in range [0,1].
        Step size shrinkage used in updates to prevent overfitting. After each boosting step,
        we can directly get the weights of new features, and eta shrinks the
        feature weights to make the boosting process more conservative.
        See the eta hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    num_estimators : int
        Int in range [0, inf]
        Number of boosted trees to fit.
        See the n_estimators hyper-parameter in:
        https://catboost.ai/docs/concepts/python-reference_parameters-list.html

    extra_params : dict, optional
        Dictionary in the format {"hyperparameter_name": hyperparameter_value}.
        Other parameters for the CatBoost model. See the list in:
        https://catboost.ai/docs/concepts/python-reference_catboostregressor.html
        If not passed, the default will be used.

    prediction_column : str
        The name of the column with the predictions from the model.

    weight_column : str, optional
        The name of the column with scores to weight the data.
    """
    from catboost import Pool, CatBoostRegressor
    import catboost

    weights = df[weight_column].values if weight_column else None
    params = extra_params if extra_params else {}
    params = assoc(params, "eta", learning_rate)

    dtrain = Pool(df[features].values,
                  df[target].values,
                  weight=weights,
                  feature_names=list(map(str, features)))
    cat_boost_regressor = CatBoostRegressor(iterations=num_estimators,
                                            **params)
    cbr = cat_boost_regressor.fit(dtrain, verbose=0)

    def p(new_df: pd.DataFrame, apply_shap: bool = False) -> pd.DataFrame:
        dtest = Pool(new_df[features].values,
                     feature_names=list(map(str, features)))
        col_dict = {prediction_column: cbr.predict(dtest)}

        if apply_shap:
            import shap
            explainer = shap.TreeExplainer(cbr)
            shap_values = list(explainer.shap_values(new_df[features]))
            shap_expected_value = explainer.expected_value

            shap_output = {
                "shap_values":
                shap_values,
                "shap_expected_value":
                np.repeat(shap_expected_value, len(shap_values))
            }

            col_dict = merge(col_dict, shap_output)

        return new_df.assign(**col_dict)

    p.__doc__ = learner_pred_fn_docstring("CatBoostRegressor", shap=False)

    log = {
        'catboost_regression_learner': {
            'features': features,
            'target': target,
            'prediction_column': prediction_column,
            'package': "catboost",
            'package_version': catboost.__version__,
            'parameters': assoc(params, "num_estimators", num_estimators),
            'feature_importance': cbr.feature_importances_,
            'training_samples': len(df)
        }
    }

    return p, p(df), log
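A minimal hedged sketch of using the learner above on a toy DataFrame. It assumes the module-level helpers the function body references (assoc from toolz, merge, learner_pred_fn_docstring, LearnerReturnType) are importable, as they are in fklearn.

import pandas as pd

toy = pd.DataFrame({
    'x1': [1.0, 2.0, 3.0, 4.0, 5.0],
    'x2': [0.5, 0.1, 0.9, 0.3, 0.7],
    'target': [1.2, 2.1, 3.3, 3.9, 5.2],
})
predict_fn, scored_df, train_log = catboost_regressor_learner(
    toy, features=['x1', 'x2'], target='target', num_estimators=50)
print(scored_df['prediction'].head())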
Example No. 20
    'max_depth': 7,
    'boosting': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'is_training_metric': False,
    'seed': 18
}
#lgb_model = lgb.train(params, lgb.Dataset(x1, label=y1), 100, lgb.Dataset(x2, label=y2), feval=lgb_rmse, verbose_eval=10, early_stopping_rounds=20)
#test['item_cnt_month'] = lgb_model.predict(test[col], num_iteration=lgb_model.best_iteration)
#test[['ID','item_cnt_month']].to_csv('lgb_submission.csv', index=False)

#CatBoost
cb_model = CatBoostRegressor(iterations=100,
                             learning_rate=0.2,
                             depth=7,
                             loss_function='RMSE',
                             eval_metric='RMSE',
                             random_seed=18,
                             od_type='Iter',
                             od_wait=20)
cb_model.fit(x1, y1, eval_set=(x2, y2), use_best_model=True, verbose=False)
print(
    'RMSE:',
    np.sqrt(
        metrics.mean_squared_error(y2.clip(0., 20.),
                                   cb_model.predict(x2).clip(0., 20.))))
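# item_cnt_month is assumed to already hold another model's prediction (e.g. the
# commented-out LGB model above), so adding the CatBoost prediction and halving it
# below forms a simple two-model average.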
test['item_cnt_month'] += cb_model.predict(test[col])
test['item_cnt_month'] /= 2
test[['ID', 'item_cnt_month']].to_csv('cb_blend_submission.csv', index=False)

# In[ ]:
Example No. 21
hc_pipeline = make_pipeline(ce.GLMMEncoder())


column_transformer = ColumnTransformer(transformers=\
                                       [('numeric_pipeline',
                                         numeric_pipeline,
                                         select_numeric_features),\
                                        ('oh_pipeline',
                                         oh_pipeline,
                                         select_oh_features),\
                                        ('hc_pipeline',
                                         hc_pipeline,
                                         select_hc_features)
                                       ],\
                                       n_jobs=n_threads,
                                       remainder='drop')

#### create pipeline ####

cat = CatBoostRegressor(thread_count=n_threads,
                        n_estimators=N_ESTIMATORS,
                        random_state=SEED,
                        verbose=False)


pipe = Pipeline(steps=[('column_transformer', column_transformer),\
                       ('variancethreshold', VarianceThreshold(threshold=0.0)),\
                       ('selectpercentile', SelectPercentile(f_regression, percentile=90)),\
                       ('model', cat)])

_ = pipe.fit(train_df, log_y_train)
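The pipeline above is fit on a log-transformed target. A hedged scoring sketch, assuming log_y_train was produced with np.log1p(y_train) and that a test_df with the same raw columns as train_df exists:

import numpy as np

log_pred = pipe.predict(test_df)   # predictions on the log scale
y_pred = np.expm1(log_pred)        # map back to the original target scale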
Example No. 22
                knclf = KNeighborsClassifier(n_neighbors=5)
                y_kn = [1 if x > 170 else 0 for x in y_train]
                knclf.fit(X_train_nona, y_kn)
                X_train['high_low_ind'] = knclf.predict(X_train_nona)
                X_valid['high_low_ind'] = knclf.predict(X_valid_nona)
                X_test_type['high_low_ind'] = knclf.predict(
                    X_test_type[X_train_nona.columns])
            train_dataset = Pool(data=X_train, label=y_train)
            valid_dataset = Pool(data=X_valid, label=y_valid)
            test_dataset = Pool(data=X_test_type)
            model = CatBoostRegressor(
                iterations=N_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                #loss_function=EVAL_METRIC,
                # bootstrap_type='Poisson',
                # bagging_temperature=5,
                task_type="GPU")  # Train on GPU

            model.fit(train_dataset,
                      eval_set=valid_dataset,
                      early_stopping_rounds=500)
            now = timer()
            update_tracking(run_id,
                            '{}_tr_sec_f{}'.format(bond_type, fold_n + 1),
                            (now - fold_start),
                            integer=True)
            logger.info('Saving model file')
Example No. 23
def train_model_regression(X,
                           X_test,
                           y,
                           params,
                           folds,
                           model_type='lgb',
                           eval_metric='mae',
                           columns=None,
                           plot_feature_importance=False,
                           model=None,
                           verbose=10000,
                           early_stopping_rounds=200,
                           n_estimators=50000,
                           mol_type=-1,
                           fold_group=None,
                           skip_folds=None,
                           phase_mark="",
                           skipped_mark=[]):
    """
    A function to train a variety of regression models.
    Returns dictionary with oof predictions, test predictions, scores and, if necessary, feature importances.

    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type

    """
    assert isinstance(skip_folds, list) or skip_folds is None
    print(f"skip_folds :{skip_folds}")

    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on train data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()
    model_list = []

    # split and train on folds
    for fold_n, (train_index,
                 valid_index) in enumerate(folds.split(X, groups=fold_group)):

        if skip_folds is not None and fold_n in skip_folds and phase_mark in skipped_mark:
            print(f'Fold {fold_n + 1} is skipped!!! at {time.ctime()}')
            oof = unpickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", )
            y_pred = unpickle(
                mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl", )
            model = unpickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", )
            fold_importance = unpickle(
                mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl", )

            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
            prediction += y_pred
            model_list += [model]
            continue

        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1,
                                      importance_type='gain')
            print(model)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            params["objective"] = "reg:linear"
            params["eval_metric"] = metrics_dict[eval_metric][
                'lgb_metric_name']
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]
                ['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )

        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1

            try:
                fold_importance.to_csv(mid_path /
                                       f"importance_cv_{fold_n}.csv")
            except Exception as e:
                print("failed to save importance...")
                print(e)

            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        model_list += [model]

        try:
            to_pickle(mid_path / f"oof_cv{phase_mark}_{fold_n}.pkl", oof)
            to_pickle(mid_path / f"prediction_cv{phase_mark}_{fold_n}.pkl",
                      y_pred)
            to_pickle(mid_path / f"model_cv{phase_mark}_{fold_n}.pkl", model)
            to_pickle(mid_path / f"importance_cv{phase_mark}_{fold_n}.pkl",
                      fold_importance)
        except Exception as e:
            print("failed to save intermediate data...")
            print(e)

    if model_type == 'lgb' and plot_feature_importance:
        result_dict['importance'] = feature_importance

    prediction /= folds.n_splits
    try:
        cv_score_msg = f'{DATA_VERSION}_{TRIAL_NO}' + ' CV mean score: {0:.4f}, std: {1:.4f}.'.format(
            np.mean(scores), np.std(scores))
        print(cv_score_msg)
        send_message(cv_score_msg)
    except Exception as e:
        print(e)
        pass

    result_dict["models"] = model_list
    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    return result_dict
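
A minimal usage sketch for the function above (all data names are assumptions; the CHAMPS-style setup groups folds by molecule):

from sklearn.model_selection import GroupKFold

# hypothetical frames/columns: train_df, test_df, feature_cols, 'molecule_name', 'target'
folds = GroupKFold(n_splits=5)
result = train_model_regression(
    X=train_df[feature_cols], X_test=test_df[feature_cols], y=train_df['target'],
    params={'learning_rate': 0.1, 'depth': 8},        # passed through to CatBoostRegressor
    folds=folds, model_type='cat', eval_metric='mae',
    fold_group=train_df['molecule_name'])
oof, test_pred = result['oof'], result['prediction']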
Exemplo n.º 24
0
    def predict(self, X, **kwargs):
        model, features, importances, iterations = self.get_model_properties()
        if not self._save_by_pickle:
            from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
            if self.num_classes >= 2:
                from_file = CatBoostClassifier()
            else:
                from_file = CatBoostRegressor()
            with open(self.model_path, mode='wb') as f:
                f.write(model)
            model = from_file.load_model(self.model_path)

        # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> lightgbm internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy(
            )  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X,
                                     dtype=np.float32 if config.data_precision
                                     == "float32" else np.float64)

        X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

        pred_contribs = kwargs.get('pred_contribs', None)
        output_margin = kwargs.get('output_margin', None)
        fast_approx = kwargs.pop('fast_approx', False)
        if fast_approx:
            kwargs['ntree_limit'] = min(config.fast_approx_num_trees,
                                        iterations - 1)
            kwargs['approx_contribs'] = pred_contribs
        else:
            kwargs['ntree_limit'] = iterations - 1

        # implicit import
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
        n_jobs = max(1, physical_cores_count)
        if not pred_contribs:
            if self.num_classes >= 2:
                preds = model.predict_proba(
                    data=X,
                    ntree_start=0,
                    ntree_end=iterations - 1,
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )

                if preds.shape[1] == 2:
                    return preds[:, 1]
                else:
                    return preds
            else:
                return model.predict(
                    data=X,
                    ntree_start=0,
                    ntree_end=iterations - 1,
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )
        else:
            # For Shapley, doesn't come from predict, instead:
            return model.get_feature_importance(
                data=X,
                ntree_start=0,
                ntree_end=iterations - 1,
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported,
                type=EFstrType.ShapValues)
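
For reference: in CatBoost's predict/predict_proba the ntree_end argument is the index of the first tree that is not used, so ntree_end=iterations - 1 above evaluates all but the last tree. A small self-contained sketch of the same parameters:

import numpy as np
from catboost import CatBoostRegressor

rng = np.random.RandomState(0)
X_small, y_small = rng.rand(100, 5), rng.rand(100)

reg = CatBoostRegressor(iterations=100, random_seed=0, verbose=False)
reg.fit(X_small, y_small)
pred_all = reg.predict(X_small)                                 # uses every tree
pred_half = reg.predict(X_small, ntree_start=0, ntree_end=50)   # trees 0..49 only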
Exemplo n.º 25
0
X_train = train_df[train_features]
y_train = train_df.logerror
print(X_train.shape, y_train.shape)

test_df['transactiondate'] = pd.Timestamp('2017-12-01')
test_df = add_date_features(test_df)
X_test = test_df[train_features]
print(X_test.shape)

num_ensembles = 5
y_pred = 0.0
for i in tqdm(range(num_ensembles)):
    model = CatBoostRegressor(iterations=630,
                              learning_rate=0.03,
                              depth=6,
                              l2_leaf_reg=3,
                              loss_function='MAE',
                              eval_metric='MAE',
                              random_seed=i)
    model.fit(X_train, y_train, cat_features=cat_feature_inds)
    y_pred += model.predict(X_test)
y_pred /= num_ensembles

submission = pd.DataFrame({
    'ParcelId': test_df['ParcelId'],
})
test_dates = {
    '201610': pd.Timestamp('2016-09-30'),
    '201611': pd.Timestamp('2016-10-31'),
    '201612': pd.Timestamp('2016-11-30'),
    '201710': pd.Timestamp('2017-09-30'),
Exemplo n.º 26
0
This script will be my
submission script
"""
# Importing some libraries
import pandas as pd
from catboost import CatBoostRegressor
import numpy as np

# Getting the submission set
raw_sub = pd.read_csv('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                      '/Data/Raw-Data/test.csv')
ids = raw_sub['Id']
sub_prep = pd.read_csv('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                      '/Data/Prepared Data/prepared-submission-data.csv')
# Loading the Model
model = CatBoostRegressor()
model.load_model('/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                      '/Models/tuned_catboost1.cbm')

# Making predictions
pred = model.predict(sub_prep)

# Building a dataframe
final_sub = pd.DataFrame()
final_sub['Id'] = ids
final_sub['SalePrice'] = np.expm1(pred)  # invert the log1p transform that was applied to the target during training

# Putting Submission into a CSV file
final_sub.to_csv(path_or_buf='/Users/jinalshah/Jinal/Github Repos/House-Prices-Challenge-Solution'+
                      '/Submissions/submission10.csv', index=False)
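
For completeness, a guess at how a file like tuned_catboost1.cbm is produced (not shown in the snippet): CatBoost models are persisted with save_model and restored with load_model. A sketch with stand-in data:

import numpy as np
from catboost import CatBoostRegressor

# hypothetical stand-in for the prepared training data
X_demo, y_demo = np.random.rand(200, 10), np.random.rand(200)

tuned = CatBoostRegressor(iterations=200, learning_rate=0.05, depth=6, verbose=False)
tuned.fit(X_demo, y_demo)
tuned.save_model('tuned_catboost1.cbm')     # binary .cbm format

restored = CatBoostRegressor()
restored.load_model('tuned_catboost1.cbm')  # same call pattern as in the snippet above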
Exemplo n.º 27
0
def main():

    # Loading Data
    training_data = pd.read_csv(
        r'../input/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-train.csv',
        sep=',',
        error_bad_lines=False,
        index_col=False,
        low_memory=False).drop_duplicates()
    predict_data = pd.read_csv(
        r'../input/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-test.csv',
        sep=',',
        error_bad_lines=False,
        index_col=False,
        low_memory=False)

    # Preprocessing
    training_data, predict_data = preprocess(training_data, predict_data)

    # Renaming columns and making all categorical feature values to lowercase
    training_data = rename_and_lower(training_data)
    predict_data = rename_and_lower(predict_data)

    # Handling null values
    training_data = impute(training_data)
    predict_data = impute(predict_data)

    y = training_data['total_yearly_income']

    # Combining Training and Test data sets to get all possible values of a categorical feature
    train_plus_test = pd.concat(objs=[training_data, predict_data],
                                axis=0,
                                sort=True)

    # Making non numeric columns as CategoricalDtype
    for column in train_plus_test.select_dtypes(include=['object']).columns:  # np.object is removed in recent NumPy
        training_data[column] = training_data[column].astype(
            CategoricalDtype(categories=train_plus_test[column].unique()))
        predict_data[column] = predict_data[column].astype(
            CategoricalDtype(categories=train_plus_test[column].unique()))

    X = training_data.drop(columns=['total_yearly_income'])

    # Split data into train and validate datasets
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=0)

    # One Hot Encode Categorical features
    X_train = pd.get_dummies(X_train, prefix_sep='_', drop_first=True)
    X_test = pd.get_dummies(X_test, prefix_sep='_', drop_first=True)

    # Initialize Model parameters
    #categorical_features_indices = np.where(x_train.dtypes != np.float)[0]
    model = CatBoostRegressor(iterations=7000,
                              depth=4,
                              learning_rate=0.03,
                              loss_function='MAE',
                              verbose=1000,
                              od_type="Iter",
                              od_wait=500,
                              use_best_model=True,
                              task_type='GPU')

    # Train model with labelled dataset
    model.fit(X_train, y_train, eval_set=(X_test, y_test), plot=True)

    # Run prediction on the validation split and check MAE
    j_validate = model.predict(X_test)
    print("Mean Absolute Error: ",
          mean_absolute_error(np.exp(y_test), np.exp(j_validate)))

    prediction = predictIncome(predict_data, model)

    output_file = '../input/tcd-ml-comp-201920-income-pred-group/tcd-ml-1920-group-income-submission.csv'

    # Write prediction to output file
    writeOutput(prediction, output_file)

    # create a link to download the dataframe which was saved with .to_csv method
    create_download_link(filename=output_file)
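
A side note on an alternative the snippet does not use: CatBoost can consume categorical columns directly via cat_features instead of one-hot encoding them. A sketch assuming the X_train/X_test frames as they were before the get_dummies step:

from catboost import CatBoostRegressor, Pool

cat_cols = X_train.select_dtypes(include=['category', 'object']).columns.tolist()
train_pool = Pool(X_train, y_train, cat_features=cat_cols)
valid_pool = Pool(X_test, y_test, cat_features=cat_cols)
native_model = CatBoostRegressor(iterations=7000, depth=4, learning_rate=0.03,
                                 loss_function='MAE', od_type='Iter', od_wait=500,
                                 use_best_model=True, verbose=1000)
native_model.fit(train_pool, eval_set=valid_pool)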
Exemplo n.º 28
0
def train_model_regression(X,
                           X_test,
                           y,
                           params,
                           folds,
                           model_type='lgb',
                           eval_metric='mae',
                           columns=None,
                           plot_feature_importance=False,
                           model=None,
                           verbose=10000,
                           early_stopping_rounds=200,
                           n_estimators=50000):
    """
    A function to train a variety of regression models.
    Returns a dictionary with out-of-fold (OOF) predictions, test predictions, scores and, if requested, feature importances.
    
    :params: X - training data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: X_test - test data, can be pd.DataFrame or np.ndarray (after normalizing)
    :params: y - target
    :params: folds - folds to split data
    :params: model_type - type of model to use
    :params: eval_metric - metric to use
    :params: columns - columns to use. If None - use all columns
    :params: plot_feature_importance - whether to plot feature importance of LGB
    :params: model - sklearn model, works only for "sklearn" model type
    
    """
    columns = X.columns if columns is None else columns
    X_test = X_test[columns]

    # to set up scoring parameters
    metrics_dict = {
        'mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'sklearn_scoring_function': metrics.mean_absolute_error
        },
        'group_mae': {
            'lgb_metric_name': 'mae',
            'catboost_metric_name': 'MAE',
            'scoring_function': group_mean_log_mae
        },
        'mse': {
            'lgb_metric_name': 'mse',
            'catboost_metric_name': 'MSE',
            'sklearn_scoring_function': metrics.mean_squared_error
        }
    }

    result_dict = {}

    # out-of-fold predictions on train data
    oof = np.zeros(len(X))

    # averaged predictions on train data
    prediction = np.zeros(len(X_test))

    # list of scores on folds
    scores = []
    feature_importance = pd.DataFrame()

    # split and train on folds
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X)):
        print(f'Fold {fold_n + 1} started at {time.ctime()}')
        if type(X) == np.ndarray:
            X_train, X_valid = X[columns][train_index], X[columns][valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
        else:
            X_train, X_valid = X[columns].iloc[train_index], X[columns].iloc[
                valid_index]
            y_train, y_valid = y.iloc[train_index], y.iloc[valid_index]

        if model_type == 'lgb':
            model = lgb.LGBMRegressor(**params,
                                      n_estimators=n_estimators,
                                      n_jobs=-1)
            model.fit(X_train,
                      y_train,
                      eval_set=[(X_train, y_train), (X_valid, y_valid)],
                      eval_metric=metrics_dict[eval_metric]['lgb_metric_name'],
                      verbose=verbose,
                      early_stopping_rounds=early_stopping_rounds)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test, num_iteration=model.best_iteration_)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train,
                                     label=y_train,
                                     feature_names=X.columns)
            valid_data = xgb.DMatrix(data=X_valid,
                                     label=y_valid,
                                     feature_names=X.columns)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data,
                              num_boost_round=20000,
                              evals=watchlist,
                              early_stopping_rounds=200,
                              verbose_eval=verbose,
                              params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid,
                                                     feature_names=X.columns),
                                         ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test,
                                               feature_names=X.columns),
                                   ntree_limit=model.best_ntree_limit)

        if model_type == 'sklearn':
            model = model
            model.fit(X_train, y_train)

            y_pred_valid = model.predict(X_valid).reshape(-1, )
            score = metrics_dict[eval_metric]['sklearn_scoring_function'](
                y_valid, y_pred_valid)
            print(f'Fold {fold_n}. {eval_metric}: {score:.4f}.')
            print('')

            y_pred = model.predict(X_test).reshape(-1, )

        if model_type == 'cat':
            model = CatBoostRegressor(
                iterations=20000,
                eval_metric=metrics_dict[eval_metric]['catboost_metric_name'],
                **params,
                loss_function=metrics_dict[eval_metric]
                ['catboost_metric_name'])
            model.fit(X_train,
                      y_train,
                      eval_set=(X_valid, y_valid),
                      cat_features=[],
                      use_best_model=True,
                      verbose=False)

            y_pred_valid = model.predict(X_valid)
            y_pred = model.predict(X_test)

        oof[valid_index] = y_pred_valid.reshape(-1, )
        if eval_metric != 'group_mae':
            scores.append(
                metrics_dict[eval_metric]['sklearn_scoring_function'](
                    y_valid, y_pred_valid))
        else:
            scores.append(metrics_dict[eval_metric]['scoring_function'](
                y_valid, y_pred_valid, X_valid['type']))

        prediction += y_pred

        if model_type == 'lgb' and plot_feature_importance:
            # feature importance
            fold_importance = pd.DataFrame()
            fold_importance["feature"] = columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

    prediction /= folds.n_splits

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
        np.mean(scores), np.std(scores)))

    result_dict['oof'] = oof
    result_dict['prediction'] = prediction
    result_dict['scores'] = scores

    if model_type == 'lgb':
        if plot_feature_importance:
            feature_importance["importance"] /= folds.n_splits
            cols = feature_importance[[
                "feature", "importance"
            ]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[
                feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12))
            sns.barplot(x="importance",
                        y="feature",
                        data=best_features.sort_values(by="importance",
                                                       ascending=False))
            plt.title('LGB Features (avg over folds)')

            result_dict['feature_importance'] = feature_importance

    return result_dict
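
Note that group_mean_log_mae is referenced in the metrics_dict above but not defined in these snippets. In the CHAMPS scalar-coupling competition it is usually implemented along these lines (a sketch, not the author's exact code):

import numpy as np
import pandas as pd

def group_mean_log_mae(y_true, y_pred, groups, floor=1e-9):
    # log of the per-group MAE, averaged over groups (lower is better)
    err = np.abs(np.asarray(y_true) - np.asarray(y_pred))
    maes = pd.Series(err).groupby(np.asarray(groups)).mean()
    return np.log(maes.clip(lower=floor)).mean()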
Exemplo n.º 29
0
def test_predict_sklearn_regress():
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostRegressor(iterations=2, random_seed=0)
    model.fit(train_pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 30
0
print('saving submission...')
now_time = time.strftime("%m-%d %H_%M_%S", time.localtime()) 
lgb_sub[["uid","lgb_loan_sum"]].to_csv("./submission/" +now_time+'_lightgbm_Vscore_' + str(valid_score) + '.csv', index=False, header=False)





from catboost import Pool, CatBoostRegressor

train_pool = Pool(train_df[features], train_df["loan_sum"])
test_pool = Pool(valid_df[features], valid_df["loan_sum"]) 
dtrain_all_pool = Pool(tr_user[features], tr_user["loan_sum"])
dtest_pool = Pool(ts_user[features])

catb = CatBoostRegressor(iterations=300, depth=3, learning_rate=0.05, loss_function='RMSE')
catb.fit(train_pool)
print('catb train rmse: %g' % sqrt(mean_squared_error(train_df["loan_sum"], catb.predict( train_pool))))
valid_score = sqrt(mean_squared_error(valid_df["loan_sum"], catb.predict( Pool(valid_df[features]))))
print('catb valid rmse: %g' % valid_score)

catb = CatBoostRegressor(iterations=300, depth=3, learning_rate=0.05, loss_function='RMSE')
catb.fit(dtrain_all_pool)

## write the submission file
pred = catb.predict(dtest_pool)
id_test = ts_user['uid']
catb_sub = pd.DataFrame({'uid': id_test, 'catb_loan_sum': pred})
print(catb_sub.describe())
catb_sub.loc[catb_sub["catb_loan_sum"] < 0,"catb_loan_sum"] = 0
print('saving submission...')
Exemplo n.º 31
0
def test_regression_ctr():
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostRegressor(iterations=5, random_seed=0, ctr_description=['Borders:TargetBorderCount=5:TargetBorderType=Uniform', 'Counter'])
    model.fit(pool)
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 32
0
    def predict(self, X, y=None, **kwargs):
        model, features, importances, iterations = self.get_model_properties()
        if not self._save_by_pickle:
            from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType
            if self.num_classes >= 2:
                from_file = CatBoostClassifier()
            else:
                from_file = CatBoostRegressor()
            with open(self.model_path, mode='wb') as f:
                f.write(model)
            model = from_file.load_model(self.model_path)

        # FIXME: Do equivalent throttling of predict size like def _predict_internal(self, X, **kwargs), wrap-up.
        if isinstance(X, dt.Frame) and len(self.params['cat_features']) == 0:
            # dt -> lightgbm internally using buffer leaks, so convert here
            # assume predict is after pipeline collection or in subprocess so needs no protection
            X = X.to_numpy(
            )  # don't assign back to X so don't damage during predict
            X = np.ascontiguousarray(X,
                                     dtype=np.float32 if config.data_precision
                                     == "float32" else np.float64)

        X, eval_set = self.process_cats(X, None, self.feature_names_fitted)

        pred_contribs = kwargs.get('pred_contribs', False)
        output_margin = kwargs.get('output_margin', False)
        fast_approx = kwargs.pop('fast_approx', False)
        if fast_approx:
            iterations = min(config.fast_approx_num_trees, iterations)

        # implicit import
        from catboost import CatBoostClassifier, CatBoostRegressor, EFstrType, Pool
        n_jobs = max(1, physical_cores_count)
        if not pred_contribs and not output_margin:
            if self.num_classes >= 2:
                preds = model.predict_proba(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )

                if preds.shape[1] == 2:
                    return preds[:, 1]
                else:
                    return preds
            else:
                return model.predict(
                    X,
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )
        elif output_margin:
            # uses "predict" for raw for any class
            preds = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported
            )
            if len(preds.shape
                   ) > 1 and preds.shape[1] == 2 and self.num_classes == 2:
                return preds[:, 1]
            else:
                return preds
        elif pred_contribs:
            # For Shapley, doesn't come from predict
            # For regression/binary, shap is shape of (rows, features + bias)
            # for multiclass, shap is shape of (rows, classes, features + bias)
            data = Pool(X, label=y, cat_features=self.params['cat_features'])
            if fast_approx:
                # https://github.com/catboost/catboost/issues/1146
                # https://github.com/catboost/catboost/issues/1535
                # can't specify trees, but they have approx version
                # Regular, Exact, or Approximate
                shap_calc_type = "Approximate"
            else:
                shap_calc_type = "Regular"
            # See also shap_mode
            # help(CatBoostClassifier.get_feature_importance)
            print_debug("shap_calc_type: %s" % shap_calc_type)

            pickle_path = None
            if config.debug_daimodel_level >= 2:
                self.uuid = str(uuid.uuid4())[:6]
                pickle_path = os.path.join(
                    exp_dir(), "catboost_shappredict%s.tmp.pickle" % self.uuid)
                model.save_model(
                    os.path.join(exp_dir(), "catshapproblem%s.catboost.model" %
                                 self.uuid))
                # save_obj((self, self.model, model, X, y, kwargs, shap_calc_type, self.params['cat_features']), pickle_path)
                save_obj((model, X, y, kwargs, shap_calc_type,
                          self.params['cat_features']), pickle_path)

            preds_shap = model.get_feature_importance(
                data=data,
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported,
                type=EFstrType.ShapValues,
                shap_calc_type=shap_calc_type,
            )
            # repair broken shap sum: https://github.com/catboost/catboost/issues/1125
            print_debug("shap_fix")
            preds_raw = model.predict(
                X,
                prediction_type="RawFormulaVal",
                ntree_start=0,
                ntree_end=iterations,  # index of first tree *not* to be used
                thread_count=self.params_base.get(
                    'n_jobs', n_jobs),  # -1 is not supported
            )
            if self.num_classes <= 2:
                axis = 1
            else:
                axis = 2
            orig_sum = np.sum(preds_shap, axis=axis)
            print_debug("shap_fix2")
            # avoid division by 0, need different trick, e.g. change baseline, to fix that case
            if axis == 1:
                orig_sum[orig_sum[:] == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, None] / orig_sum[:,
                                                                        None]
            else:
                # each feature and each class must sum up
                orig_sum[orig_sum[:, :] == 0.0] = 1.0
                preds_shap = preds_shap * preds_raw[:, :,
                                                    None] / orig_sum[:, :,
                                                                     None]

            if config.hard_asserts and config.debug_daimodel_level >= 2:
                print_debug("shap_check")
                model.save_model(os.path.join(exp_dir(), "catshapproblem"))
                pickle.dump((X, y, self.params['cat_features']),
                            open(os.path.join(exp_dir(), "catshapproblem.pkl"),
                                 "wb"))
                preds_raw = model.predict(
                    X,
                    prediction_type="RawFormulaVal",
                    ntree_start=0,
                    ntree_end=iterations,  # index of first tree *not* to be used
                    thread_count=self.params_base.get(
                        'n_jobs', n_jobs),  # -1 is not supported
                )

                assert np.isclose(preds_raw, np.sum(
                    preds_shap, axis=axis)).all(
                    ), "catboost shapley does not sum up correctly"

            if config.debug_daimodel_level <= 2:
                remove(pickle_path)

            if axis == 1:
                return preds_shap
            else:
                # DAI expects (shape rows) * (classes x (features + 1)) with "columns" as blocks of
                # feature_0_class_0 feature_0_class_0 ... feature_0_class_1 feature_1_class_1 ...
                return preds_shap.reshape(
                    preds_shap.shape[0],
                    preds_shap.shape[1] * preds_shap.shape[2])
        else:
            raise RuntimeError("No such case")
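
A compact illustration of the renormalization performed above, for the regression/binary case (the multiclass branch does the same along axis 2): each SHAP row is rescaled so it sums to the raw prediction, working around the CatBoost issue linked in the comments.

import numpy as np

def rescale_shap_rows(preds_shap, preds_raw):
    # preds_shap: (rows, features + 1); preds_raw: (rows,)
    row_sum = preds_shap.sum(axis=1)
    row_sum[row_sum == 0.0] = 1.0            # avoid division by zero, as in the snippet
    return preds_shap * (preds_raw / row_sum)[:, None]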
Exemplo n.º 33
0
            flag = False
            break
    return flag


from google.colab import files

uploaded = files.upload()

X_train = (pd.read_csv('h_10_X_train.csv')).values
X_test = (pd.read_csv('h_10_X_test.csv')).values
X_val = (pd.read_csv('h_10_X_val.csv')).values
Y_train = (pd.read_csv('h_10_Y_train.csv')['0']).values
Y_test = (pd.read_csv('h_10_Y_test.csv')['0']).values
Y_val = (pd.read_csv('h_10_Y_val.csv')['0']).values
model = CatBoostRegressor(iterations=100)
model.fit(X_train, Y_train, verbose=False)

print(model.score(X_val, Y_val))

num_of_individs = 12
best_indiv = [[0 for i in range(len(X_train[0]))] for j in range(num_of_individs)]
for i in range(num_of_individs):
    for j in range(len(X_train[0])):
        best_indiv[i][j] = 1

ds = Dataset(X_train, Y_train, X_test, Y_test, X_val, Y_val)
current_set = []
for i in range(num_of_individs):
    current_set.append(individual(best_indiv[i]))
    current_set[i].get_score(ds.X_train, ds.Y_train, ds.X_test, ds.Y_test, ds.X_val, ds.Y_val)
Exemplo n.º 34
0
def test_invalid_loss_regressor():
    with pytest.raises(CatboostError):
        pool = Pool(TRAIN_FILE, column_description=CD_FILE)
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
Exemplo n.º 35
0
                fold_count,
                feature="fc",
                model_type=MODEL_TYPE,
            )
            DEPTH = 7
            update_tracking(run_id, "depth", DEPTH)
            train_dataset = Pool(data=X_train, label=y_train)
            valid_dataset = Pool(data=X_valid, label=y_valid)
            test_dataset = Pool(data=X_test_type)
            model = CatBoostRegressor(
                iterations=N_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                # loss_function=EVAL_METRIC,
                # bootstrap_type='Poisson',
                # bagging_temperature=5,
                task_type="GPU",
            )  # Train on GPU

            model.fit(
                train_dataset,
                eval_set=valid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            now = timer()
            update_tracking(
                run_id,
Exemplo n.º 36
0
def fit_meta_feature(
    X_train,
    X_valid,
    X_test,
    Meta_train,
    train_idx,
    bond_type,
    base_fold,
    feature="fc",
    N_META_FOLDS=N_META_FOLDS,
    N_META_ESTIMATORS=N_META_ESTIMATORS,
    model_type="catboost",
):
    """
    Adds meta features to train, test and val
    """
    logger.info(f"Creating meta feature {feature}")
    logger.info("X_train, X_valid and X_test are shapes {} {} {}".format(
        X_train.shape, X_valid.shape, X_test.shape))
    folds = GroupKFold(n_splits=N_META_FOLDS)
    fold_count = 1

    # Init predictions
    X_valid["meta_" + feature] = 0
    X_test["meta_" + feature] = 0
    X_train["meta_" + feature] = 0
    X_train_oof = X_train[["meta_" + feature]].copy()
    X_train = X_train.drop("meta_" + feature, axis=1)
    feature_importance = pd.DataFrame()
    for fold_n, (train_idx2, valid_idx2) in enumerate(
            folds.split(X_train,
                        groups=mol_group_type.iloc[train_idx].values)):
        logger.info("Running Meta Feature Type {} - Fold {} of {}".format(
            feature, fold_count, folds.n_splits))
        update_tracking(run_id, "{}_meta_{}_est".format(bond_type, feature),
                        N_META_ESTIMATORS)
        update_tracking(run_id,
                        "{}_meta_{}_metafolds".format(bond_type,
                                                      feature), N_META_FOLDS)

        X_train2 = X_train.loc[X_train.reset_index().index.isin(train_idx2)]
        X_valid2 = X_train.loc[X_train.reset_index().index.isin(valid_idx2)]
        X_train2 = X_train2.copy()
        X_valid2 = X_valid2.copy()
        y_train2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            train_idx2)][feature]
        y_valid2 = Meta_train.loc[Meta_train.reset_index().index.isin(
            valid_idx2)][feature]
        fold_count += 1

        if model_type == "catboost":
            train_dataset = Pool(data=X_train2, label=y_train2)
            metavalid_dataset = Pool(data=X_valid2, label=y_valid2)
            valid_dataset = Pool(data=X_valid)
            test_dataset = Pool(data=X_test)
            model = CatBoostRegressor(
                iterations=N_META_ESTIMATORS,
                learning_rate=LEARNING_RATE,
                depth=META_DEPTH,
                eval_metric=EVAL_METRIC,
                verbose=VERBOSE,
                random_state=RANDOM_STATE,
                thread_count=N_THREADS,
                task_type="GPU",
            )  # Train on GPU

            model.fit(
                train_dataset,
                eval_set=metavalid_dataset,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )
            y_pred_meta_valid = model.predict(metavalid_dataset)
            y_pred_valid = model.predict(valid_dataset)
            y_pred = model.predict(test_dataset)

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred

            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
        elif model_type == "xgboost":
            model = xgboost.XGBRegressor(**xgb_params)
            model.fit(
                X_train2,
                y_train2,
                eval_metric=EVAL_METRIC,
                eval_set=[(X_valid2, y_valid2)],
                verbose=VERBOSE,
                early_stopping_rounds=EARLY_STOPPING_ROUNDS,
            )

            y_pred_meta_valid = model.predict(X_valid2)
            y_pred_valid = model.predict(
                X_valid.drop("meta_" + feature, axis=1))
            y_pred = model.predict(X_test.drop("meta_" + feature, axis=1))

            X_train_oof.loc[X_train_oof.reset_index().index.isin(valid_idx2),
                            "meta_" + feature] = y_pred_meta_valid
            X_valid["meta_" + feature] += y_pred_valid
            X_test["meta_" + feature] += y_pred

            fold_importance = pd.DataFrame()
            fold_importance["feature"] = X_train.columns
            fold_importance["importance"] = model.feature_importances_
            fold_importance["type"] = bond_type
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)
    oof_score = mean_absolute_error(Meta_train[feature],
                                    X_train_oof["meta_" + feature])
    log_oof_score = np.log(oof_score)
    logger.info(
        f"Meta feature {feature} has MAE {oof_score:0.4f} LMAE {log_oof_score:0.4f}"
    )
    update_tracking(
        run_id, "{}_meta_{}_mae_cv_f{}".format(bond_type, feature, base_fold),
        oof_score)
    update_tracking(
        run_id,
        "{}_meta_{}_lmae_cv_f{}".format(bond_type, feature, base_fold),
        log_oof_score,
    )

    feature_importance.to_parquet(
        "type_results/{}/meta/{}_{}_{}_fi_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_train_oof.to_parquet(
        "type_results/{}/meta/{}_{}_{}_oof_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_train.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_train_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_valid.to_parquet(
        "type_results/{}/meta/{}_{}_{}_X_valid_meta_{}_f{}_{:0.4f}MAE_{:0.4f}LMAE.parquet"
        .format(
            bond_type,
            MODEL_NUMBER,
            run_id,
            bond_type,
            feature,
            base_fold,
            oof_score,
            log_oof_score,
        ))

    X_valid["meta_" + feature] = X_valid["meta_" + feature] / N_META_FOLDS
    X_test["meta_" + feature] = X_test["meta_" + feature] / N_META_FOLDS
    X_train["meta_" + feature] = X_train_oof["meta_" + feature]
    logger.info("Done creating meta features")
    logger.info("X_train, X_valid and X_test are shapes {} {} {}".format(
        X_train.shape, X_valid.shape, X_test.shape))
    return X_train, X_valid, X_test
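
A hypothetical call of the helper above (all frame and index names are assumptions; in the original pipeline they come from the surrounding per-bond-type fold loop):

X_train, X_valid, X_test_type = fit_meta_feature(
    X_train, X_valid, X_test_type, Meta_train, train_idx,
    bond_type='1JHC', base_fold=0, feature='fc',
    model_type='catboost')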
Exemplo n.º 37
0
)  # combine the dummy variables we created with the other independent variables
# above, the categorical variables were converted to dummies, kept in the dataset and merged with the remaining independent variables

# below, the data is split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.25,
                                                    random_state=42)

print(df.head())
print(df.shape)
print(X_train.head())

# Model and Prediction
#%%
cat = CatBoostRegressor()
cat_model = cat.fit(X_train, y_train)
print(cat_model)

y_pred = cat_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse)

#Model Tuning
#%%
cat_params = {
    "iterations": [200, 500, 1000],
    "learning_rate": [0.01, 0.1],
    "depth": [3, 6, 8]
}
gs = GridSearchCV(cat, cat_params, cv=5, n_jobs=-1, verbose=2)