Exemplo n.º 1
0
def test_load_generated():
    """A Pool built from in-memory arrays must hand back the same features and labels."""
    n_rows, n_cols = 100, 10
    features = np.round(np.random.normal(size=(n_rows, n_cols)), decimals=3)
    targets = np.random.randint(2, size=n_rows)

    generated_pool = Pool(features, targets)

    assert _check_data(generated_pool.get_features(), features)
    assert _check_data(generated_pool.get_label(), targets)
Exemplo n.º 2
0
def test_load_df_vs_load_from_file():
    """A Pool loaded from file must equal a Pool built from the same table read as strings."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None, dtype=str)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    # Everything except the target column is a feature.
    features = table.drop([TARGET_IDX], axis=1)

    array_pool = Pool(np.array(features), target,
                      file_pool.get_cat_feature_indices())
    assert file_pool == array_pool
Exemplo n.º 3
0
def test_zero_baseline():
    """Train with an all-zero baseline and compare the saved model to the canonical one."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    train_pool.set_baseline(np.zeros(train_pool.num_row()))

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)

    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 4
0
def test_non_ones_weight():
    """Train with weights 1..num_row and compare the saved model to the canonical one."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    # Row i (0-based) gets weight i + 1.
    train_pool.set_weight(np.arange(1, train_pool.num_row() + 1))

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(train_pool)
    classifier.save_model(OUTPUT_MODEL_PATH)

    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 5
0
def test_load_df():
    """A Pool built from a DataFrame must match the Pool loaded from the same file."""
    file_pool = Pool(NAN_TRAIN_FILE, column_description=NAN_CD_FILE)

    table = read_table(NAN_TRAIN_FILE, header=None)
    target = DataFrame(table.iloc[:, TARGET_IDX])
    features = table.drop([TARGET_IDX], axis=1)

    frame_pool = Pool(features, target, file_pool.get_cat_feature_indices())

    assert _check_data(file_pool.get_features(), frame_pool.get_features())
    assert _check_data(file_pool.get_label(), frame_pool.get_label())
Exemplo n.º 6
0
def test_fit_data():
    """Fit on raw features with sample weights and baselines, then compare to the canonical model."""
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    # A first model provides the raw-formula predictions used as baselines.
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(train_pool)
    baseline = np.array(base_model.predict(train_pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool.set_baseline(eval_baseline)

    model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    features = map_cat_features(train_pool.get_features(), train_pool.get_cat_feature_indices())
    model.fit(
        features,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        sample_weight=np.arange(1, train_pool.num_row() + 1),
        baseline=baseline,
        use_best_model=True,
        eval_set=eval_pool,
    )
    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 7
0
def test_load_series():
    """A Pool built from a Series of feature rows must match the Pool loaded from file."""
    file_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    table = read_table(TRAIN_FILE, header=None)
    target = Series(table.iloc[:, TARGET_IDX])
    # One Series element per row, each holding that row's feature values.
    feature_rows = Series(list(table.drop([TARGET_IDX], axis=1).values))

    series_pool = Pool(feature_rows, target, file_pool.get_cat_feature_indices())

    assert _check_data(file_pool.get_features(), series_pool.get_features())
    assert _check_data(file_pool.get_label(), series_pool.get_label())
Exemplo n.º 8
0
def test_pool_after_fit():
    """Fitting a model must leave the features of the training Pool unchanged."""
    reference_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    fitted_pool = Pool(TRAIN_FILE, column_description=CD_FILE)

    # Sanity check before training: both pools start out identical.
    assert _check_data(reference_pool.get_features(), fitted_pool.get_features())

    CatBoostClassifier(iterations=5, random_seed=0).fit(fitted_pool)

    assert _check_data(reference_pool.get_features(), fitted_pool.get_features())
Exemplo n.º 9
0
def test_no_cat_in_predict():
    """Predicting on mapped raw features must match predicting on a Pool with cat_features set."""
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=2, random_seed=0)
    model.fit(train_pool)

    raw_features = map_cat_features(test_pool.get_features(),
                                    train_pool.get_cat_feature_indices())
    raw_predictions = model.predict(raw_features)

    pooled_features = Pool(
        map_cat_features(test_pool.get_features(),
                         train_pool.get_cat_feature_indices()),
        cat_features=train_pool.get_cat_feature_indices(),
    )
    pool_predictions = model.predict(pooled_features)

    assert _check_data(raw_predictions, pool_predictions)
Exemplo n.º 10
0
def test_load_dumps():
    """A Pool read from a tab-separated dump must match the Pool built from the arrays."""
    n_rows, n_cols = 100, 10
    data = np.random.randint(10, size=(n_rows, n_cols))
    label = np.random.randint(2, size=n_rows)
    array_pool = Pool(data, label)

    # One line per row: the label first, then the feature values, tab-separated.
    dump_lines = (
        '\t'.join([str(target)] + [str(value) for value in row])
        for target, row in zip(label, data)
    )
    with open('test_data_dumps', 'w') as dump_file:
        dump_file.write('\n'.join(dump_lines))

    file_pool = Pool('test_data_dumps')
    assert _check_data(array_pool.get_features(), file_pool.get_features())
    assert _check_data(array_pool.get_label(), file_pool.get_label())
Exemplo n.º 11
0
def test_fit_no_label():
    """Fitting on features alone, with no label, must raise CatboostError."""
    # Build the fixtures outside the raises block so that a failure while
    # loading the pool cannot be mistaken for the expected error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    features = pool.get_features()
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.fit(features)
Exemplo n.º 12
0
def main_task():
    """Sample rows, predict orders, show them next to the real orders and compute SHAP values.

    Reads the module-level names ``manual_entry``, ``df``, ``num_prediccions``,
    ``cols_input`` and ``cols_empty``; output goes through ``st``.

    Returns:
        tuple: ``(test_data, res, shap_values, tmp_df, fig)`` - the prediction
        Pool, the processed frame it was built from, the SHAP values from the
        model, the sampled rows and the Plotly figure.
    """
    # A manual entry is a single row; otherwise sample the configured amount.
    tmp_df = get_sample(df, 1 if manual_entry else num_prediccions)
    res = process_data(tmp_df, cols_input, cols_empty, manual_entry)

    # Split the target off the features; the 'Pedido real' column is discarded.
    test_real_labels = res.pop('Pedido_real')
    res.pop('Pedido real')

    # Every column that is not float32/float64 is passed as categorical.
    column_dtypes = res.dtypes
    is_not_float = (column_dtypes != 'float32') & (column_dtypes != 'float64')
    cat_features = np.where(is_not_float)[0]
    test_data = Pool(res, test_real_labels, cat_features=cat_features)

    model = get_model()
    preds = model.predict(test_data)
    rounded_preds = [int(np.round(p)) for p in preds]

    st.text("Predicción de pedidos: {}".format(rounded_preds))
    st.text("Pedido real realizado: {}".format(list(test_real_labels)))

    resultados = pd.DataFrame(
        np.stack([rounded_preds, np.array(test_real_labels)], axis=1),
        columns=['Predicciones', 'Pedidos reales'],
    )

    # One line per series, real orders first.
    fig = go.Figure()
    for column in ('Pedidos reales', 'Predicciones'):
        fig.add_trace(go.Scatter(x=resultados.index,
                                 y=resultados[column],
                                 mode='lines+markers',
                                 name=column))
    st.plotly_chart(fig, use_container_width=True)

    shap_values = model.get_feature_importance(
        data=test_data,
        type='ShapValues',
        shap_calc_type='Approximate',
    )
    return test_data, res, shap_values, tmp_df, fig
#print(clf.predict(X_test[104]))
cm = confusion_matrix(y_test, y_pred)

from sklearn.metrics import recall_score, precision_score

print(recall_score(y_test, y_pred, average='macro'))

print(precision_score(y_test, y_pred, average='micro'))

print(accuracy_score(y_test, y_pred))

#cr0ss validati0n

cv_params = clf.get_params()
cv_params.update({'loss_function': 'Logloss'})
cv_data = cv(Pool(X, y, cat_features=cat_featuresind), cv_params, plot=True)

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(cv_data['test-Accuracy-mean']),
    cv_data['test-Accuracy-std'][np.argmax(cv_data['test-Accuracy-mean'])],
    np.argmax(cv_data['test-Accuracy-mean'])))

print('Precise validation accuracy score: {}'.format(
    np.max(cv_data['test-Accuracy-mean'])))
"""

# Print the fitted model's feature importances and draw a horizontal bar
# chart of the entries selected by `cat_featuresind`.
importances = clf.feature_importances_
print(clf.feature_importances_)
plt.title('Feature Importances ')
# One bar per index in cat_featuresind; bars are positioned 0..len-1.
plt.barh(range(len(cat_featuresind)), importances[cat_featuresind], color='b', align='center')
#plt.yticks(dataset[i][0] for i in cat_featuresind)
Exemplo n.º 14
0
 def fit(self, Xc: FloatTensor, Xe: LongTensor, y: FloatTensor):
     """Fit the wrapped model on the rows kept by ``filter_nan(..., 'all')``."""
     Xc, Xe, y = filter_nan(Xc, Xe, y, 'all')
     features = self.xtrans(Xc=Xc, Xe=Xe)
     # The label is passed as a flat array.
     targets = y.numpy().reshape(-1)
     self.model.fit(Pool(data=features, label=targets))
Exemplo n.º 15
0
def test_fit_data():
    """Fit on raw features with sample weights and baselines, then register the canonical file."""
    train_pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    # A first model provides the raw-formula predictions used as baselines.
    base_model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    base_model.fit(train_pool)
    baseline = np.array(base_model.predict(train_pool, prediction_type='RawFormulaVal'))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type='RawFormulaVal'))
    eval_pool._set_baseline(eval_baseline)

    model = CatBoostClassifier(iterations=2, random_seed=0, loss_function="MultiClass")
    features = map_cat_features(train_pool.get_features(), train_pool.get_cat_feature_indices())
    row_weights = np.arange(1, train_pool.num_row() + 1)
    model.fit(
        features,
        train_pool.get_label(),
        train_pool.get_cat_feature_indices(),
        sample_weight=row_weights,
        baseline=baseline,
        use_best_model=True,
        eval_set=eval_pool,
    )
    model.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 16
0
# Hold out 20% of the training data for validation (fixed seed).
x_train, x_valid, y_train, y_valid = train_test_split(train_x,
                                                      train_y,
                                                      test_size=0.2,
                                                      random_state=2019)

# Placeholder predictions; overwritten by the CatBoost predictions below.
y_pred_lgb = np.zeros(len(x_test))

# MAE is both the training objective and the evaluation metric.
catboost_model = CatBoostRegressor(custom_metric='MAE',
                                   eval_metric='MAE',
                                   learning_rate=0.1,
                                   l2_leaf_reg=5,
                                   early_stopping_rounds=100,
                                   num_trees=2000,
                                   loss_function='MAE',
                                   verbose=True)
train_pool = Pool(x_train, y_train)
val_pool = Pool(x_valid, y_valid)
# The validation pool is the eval_set, which early_stopping_rounds monitors.
catboost_model.fit(train_pool, eval_set=val_pool, verbose_eval=100)

# Predict on the test features (no labels).
test_pool = Pool(x_test)
y_pred_lgb = catboost_model.predict(test_pool)

result = pd.read_csv('/cos_person/tencent/train/test_id.csv')
# NOTE(review): the bare list below is an expression statement with no effect;
# presumably a leftover list of the CSV's columns - confirm and remove.
['sample_id', 'ad_id']

# Attach the predictions as the 'ecpm' column; result_tmp keeps the first 10 rows.
result['ecpm'] = y_pred_lgb
result_tmp = result[:10]
# This CSV has no header row.
request_ecpm = pd.read_csv('/cos_person/tencent/train/max_total.csv',
                           header=None)
request_ecpm.columns = [
    'Ad_Request_id', 'Ad_Request_Time', 'user_id', 'Ad_pos_id', 'test_ad_id',
Exemplo n.º 17
0
def test_predict_without_fit():
    """Calling predict on a model that was never fitted must raise CatboostError."""
    # Build the fixtures outside the raises block so that a failure while
    # loading the pool cannot be mistaken for the expected error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.predict(pool)
Exemplo n.º 18
0
def test_fit_no_label():
    """Fitting on features alone, with no label, must raise CatboostError."""
    # Build the fixtures outside the raises block so that a failure while
    # loading the pool cannot be mistaken for the expected error.
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    features = pool.get_features()
    model = CatBoostClassifier()
    with pytest.raises(CatboostError):
        model.fit(features)
Exemplo n.º 19
0
def test_invalid_loss_regressor():
    """A regressor configured with an unknown loss function must raise CatboostError."""
    # Load the pool outside the raises block so that a data-loading failure
    # cannot be mistaken for the expected error. Model construction stays
    # inside because the invalid loss may be rejected there or in fit().
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostRegressor(loss_function="fee")
        model.fit(pool)
Exemplo n.º 20
0
def test_invalid_loss_classifier():
    """A classifier configured with an unknown loss function must raise CatboostError."""
    # Load the pool outside the raises block so that a data-loading failure
    # cannot be mistaken for the expected error. Model construction stays
    # inside because the invalid loss may be rejected there or in fit().
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoostClassifier(loss_function="abcdef")
        model.fit(pool)
Exemplo n.º 21
0
def test_python_export_with_cat_features():
    """Export a model trained on the train file as Python code and register the canonical file."""
    training_params = dict(iterations=20, random_seed=0)
    model = CatBoost(training_params)
    model.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    model.save_model(OUTPUT_PYTHON_MODEL_PATH, format="python")
    return local_canonical_file(OUTPUT_PYTHON_MODEL_PATH)
Exemplo n.º 22
0
def test_predict_sklearn_regress():
    """Train a two-iteration regressor and compare the saved model to the canonical one."""
    regressor = CatBoostRegressor(iterations=2, random_seed=0)
    regressor.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    regressor.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
Exemplo n.º 23
0
def test_pool_cat_features():
    """The categorical indices reported by the Pool must all equal CAT_FEATURES."""
    detected = Pool(TRAIN_FILE, column_description=CD_FILE).get_cat_feature_indices()
    assert np.all(detected == CAT_FEATURES)
Exemplo n.º 24
0
    def _fit(self,
             X,
             y,
             X_val=None,
             y_val=None,
             time_limit=None,
             num_gpus=0,
             sample_weight=None,
             sample_weight_val=None,
             **kwargs):
        """Train a CatBoost model on ``X``/``y`` and store it in ``self.model``.

        When ``time_limit`` is set, a short "init" model is trained first to
        measure the time per iteration; the iteration count of the final model
        is then capped by the remaining time and by an estimate of available
        memory. Afterwards either the init model or the final model is kept,
        based on their best validation scores.

        Args:
            X: Training features; must expose ``.columns`` and is passed
                through ``self.preprocess`` before being wrapped in a ``Pool``.
            y: Training labels.
            X_val: Optional validation features, used as ``eval_set``.
            y_val: Optional validation labels.
            time_limit: Optional time budget, compared against ``time.time()``
                differences. Falsy values disable the time-based logic.
            num_gpus: If non-zero and ``task_type`` is not already in the model
                params, ``task_type`` is set to ``'GPU'``.
            sample_weight: Optional weights for the training ``Pool``.
            sample_weight_val: Optional weights for the validation ``Pool``.
            **kwargs: Only ``verbosity`` (default 2) is read here.

        Raises:
            NotEnoughMemoryError: If the rough memory estimate exceeds
                ``max_memory_usage_ratio`` of the available memory.
            TimeLimitExceeded: If at most 40% of ``time_limit`` is left after
                preprocessing.
        """
        try_import_catboost()
        from catboost import CatBoostClassifier, CatBoostRegressor, Pool
        ag_params = self._get_ag_params()
        params = self._get_model_params()
        if self.problem_type == SOFTCLASS:
            # FIXME: This is extremely slow due to unoptimized metric / objective sent to CatBoost
            from .catboost_softclass_utils import SoftclassCustomMetric, SoftclassObjective
            params['loss_function'] = SoftclassObjective.SoftLogLossObjective()
            params['eval_metric'] = SoftclassCustomMetric.SoftLogLossMetric()

        model_type = CatBoostClassifier if self.problem_type in PROBLEM_TYPES_CLASSIFICATION else CatBoostRegressor
        # The metric name is needed later to look up the best validation score;
        # custom metric objects are identified by their class name.
        if isinstance(params['eval_metric'], str):
            metric_name = params['eval_metric']
        else:
            metric_name = type(params['eval_metric']).__name__
        num_rows_train = len(X)
        num_cols_train = len(X.columns)
        if self.problem_type == MULTICLASS:
            if self.num_classes is not None:
                num_classes = self.num_classes
            else:
                num_classes = 10  # Guess if not given, can do better by looking at y
        elif self.problem_type == SOFTCLASS:  # TODO: delete this elif if it's unnecessary.
            num_classes = y.shape[1]
        else:
            num_classes = 1

        # TODO: Add ignore_memory_limits param to disable NotEnoughMemoryError Exceptions
        max_memory_usage_ratio = self.params_aux['max_memory_usage_ratio']
        approx_mem_size_req = num_rows_train * num_cols_train * num_classes / 2  # TODO: Extremely crude approximation, can be vastly improved
        if approx_mem_size_req > 1e9:  # > 1 GB
            available_mem = psutil.virtual_memory().available
            ratio = approx_mem_size_req / available_mem
            if ratio > (1 * max_memory_usage_ratio):
                logger.warning(
                    '\tWarning: Not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                    % (round(approx_mem_size_req / 1e9,
                             3), round(available_mem / 1e9, 3)))
                raise NotEnoughMemoryError
            elif ratio > (0.2 * max_memory_usage_ratio):
                logger.warning(
                    '\tWarning: Potentially not enough memory to safely train CatBoost model, roughly requires: %s GB, but only %s GB is available...'
                    % (round(approx_mem_size_req / 1e9,
                             3), round(available_mem / 1e9, 3)))

        start_time = time.time()
        X = self.preprocess(X)
        # Columns with the pandas 'category' dtype are passed as categorical features.
        cat_features = list(X.select_dtypes(include='category').columns)
        # From here on X is a catboost Pool, no longer a DataFrame.
        X = Pool(data=X,
                 label=y,
                 cat_features=cat_features,
                 weight=sample_weight)

        if X_val is None:
            eval_set = None
            num_sample_iter_max = 50
            early_stopping_rounds = None
        else:
            X_val = self.preprocess(X_val)
            X_val = Pool(data=X_val,
                         label=y_val,
                         cat_features=cat_features,
                         weight=sample_weight_val)
            eval_set = X_val
            # Use fewer sample iterations when there are more than 10000 training rows.
            modifier = min(1.0, 10000 / num_rows_train)
            num_sample_iter_max = max(round(modifier * 50), 2)
            early_stopping_rounds = ag_params.get('ag.early_stop', 'auto')
            if isinstance(early_stopping_rounds, str):
                early_stopping_rounds = self._get_early_stopping_rounds(
                    num_rows_train=num_rows_train,
                    strategy=early_stopping_rounds)

        if params.get('allow_writing_files', False):
            if 'train_dir' not in params:
                try:
                    # TODO: What if path is in S3?
                    os.makedirs(os.path.dirname(self.path), exist_ok=True)
                # NOTE(review): bare except swallows every error, including
                # KeyboardInterrupt; train_dir is simply left unset on failure.
                except:
                    pass
                else:
                    params['train_dir'] = self.path + 'catboost_info'

        # TODO: Add more control over these params (specifically early_stopping_rounds)
        # Map the verbosity level to CatBoost's `verbose` argument.
        verbosity = kwargs.get('verbosity', 2)
        if verbosity <= 1:
            verbose = False
        elif verbosity == 2:
            verbose = False
        elif verbosity == 3:
            verbose = 20
        else:
            verbose = True

        init_model = None
        init_model_tree_count = None
        init_model_best_score = None

        num_features = len(self._features)

        if num_gpus != 0:
            if 'task_type' not in params:
                params['task_type'] = 'GPU'
                logger.log(
                    20,
                    f'\tTraining {self.name} with GPU, note that this may negatively impact model quality compared to CPU training.'
                )
                # TODO: Confirm if GPU is used in HPO (Probably not)
                # TODO: Adjust max_bins to 254?

        # Drop the column-subsampling params when training on GPU.
        if params.get('task_type', None) == 'GPU':
            if 'colsample_bylevel' in params:
                params.pop('colsample_bylevel')
                logger.log(
                    30,
                    f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
                )
            if 'rsm' in params:
                params.pop('rsm')
                logger.log(
                    30,
                    f'\t\'rsm\' is not supported on GPU, using default value (Default = 1).'
                )

        if self.problem_type == MULTICLASS and 'rsm' not in params and 'colsample_bylevel' not in params and num_features > 1000:
            if time_limit:
                # Reduce sample iterations to avoid taking unreasonable amounts of time
                num_sample_iter_max = max(round(num_sample_iter_max / 2), 2)
            # Subsample columns to speed up training
            if params.get('task_type',
                          None) != 'GPU':  # RSM does not work on GPU
                params['colsample_bylevel'] = max(
                    min(1.0, 1000 / num_features), 0.05)
                logger.log(
                    30,
                    f'\tMany features detected ({num_features}), dynamically setting \'colsample_bylevel\' to {params["colsample_bylevel"]} to speed up training (Default = 1).'
                )
                logger.log(
                    30,
                    f'\tTo disable this functionality, explicitly specify \'colsample_bylevel\' in the model hyperparameters.'
                )
            else:
                params['colsample_bylevel'] = 1.0
                logger.log(
                    30,
                    f'\t\'colsample_bylevel\' is not supported on GPU, using default value (Default = 1).'
                )

        logger.log(15, f'\tCatboost model hyperparameters: {params}')

        if time_limit:
            time_left_start = time_limit - (time.time() - start_time)
            if time_left_start <= time_limit * 0.4:  # if 60% of time was spent preprocessing, likely not enough time to train model
                raise TimeLimitExceeded
            # Train a short init model to measure the time taken per iteration.
            params_init = params.copy()
            num_sample_iter = min(num_sample_iter_max,
                                  params_init['iterations'])
            params_init['iterations'] = num_sample_iter
            self.model = model_type(**params_init, )
            self.model.fit(
                X,
                eval_set=eval_set,
                use_best_model=True,
                verbose=verbose,
                # early_stopping_rounds=early_stopping_rounds,
            )

            init_model_tree_count = self.model.tree_count_
            init_model_best_score = self._get_best_val_score(
                self.model, metric_name)

            time_left_end = time_limit - (time.time() - start_time)
            time_taken_per_iter = (time_left_start -
                                   time_left_end) / num_sample_iter
            estimated_iters_in_time = round(time_left_end /
                                            time_taken_per_iter)
            init_model = self.model

            if self.stopping_metric._optimum == init_model_best_score:
                # Done, pick init_model
                params_final = None
            else:
                params_final = params.copy()

                # TODO: This only handles memory with time_limit specified, but not with time_limit=None, handle when time_limit=None
                available_mem = psutil.virtual_memory().available
                if self.problem_type == SOFTCLASS:  # TODO: remove this once catboost-dev is no longer necessary and SOFTCLASS objectives can be pickled.
                    model_size_bytes = 1  # skip memory check
                else:
                    model_size_bytes = sys.getsizeof(pickle.dumps(self.model))

                # Estimate how many iterations fit in memory from the pickled
                # size of the init model, assuming size grows linearly.
                max_memory_proportion = 0.3 * max_memory_usage_ratio
                mem_usage_per_iter = model_size_bytes / num_sample_iter
                max_memory_iters = math.floor(
                    available_mem * max_memory_proportion / mem_usage_per_iter)
                if params.get('task_type', None) == 'GPU':
                    # Cant use init_model
                    iterations_left = params['iterations']
                else:
                    iterations_left = params['iterations'] - num_sample_iter
                params_final['iterations'] = min(iterations_left,
                                                 estimated_iters_in_time)
                if params_final[
                        'iterations'] > max_memory_iters - num_sample_iter:
                    if max_memory_iters - num_sample_iter <= 500:
                        logger.warning(
                            '\tWarning: CatBoost will be early stopped due to lack of memory, increase memory to enable full quality models, max training iterations changed to %s from %s'
                            % (max_memory_iters,
                               params_final['iterations'] + num_sample_iter))
                    params_final[
                        'iterations'] = max_memory_iters - num_sample_iter
        else:
            params_final = params.copy()

        # params_final is None when the init model already reached the metric's
        # optimum; in that case self.model remains the init model.
        if params_final is not None and params_final['iterations'] > 0:
            self.model = model_type(**params_final, )

            fit_final_kwargs = dict(
                eval_set=eval_set,
                verbose=verbose,
                early_stopping_rounds=early_stopping_rounds,
            )

            # TODO: Strangely, this performs different if clone init_model is sent in than if trained for same total number of iterations. May be able to optimize catboost models further with this
            warm_start = False
            if params_final.get('task_type', None) == 'GPU':
                # Cant use init_model
                fit_final_kwargs['use_best_model'] = True
            elif init_model is not None:
                fit_final_kwargs['init_model'] = init_model
                warm_start = True
            self.model.fit(X, **fit_final_kwargs)

            if init_model is not None:
                final_model_best_score = self._get_best_val_score(
                    self.model, metric_name)

                if self.stopping_metric._optimum == init_model_best_score:
                    # Done, pick init_model
                    self.model = init_model
                else:
                    # If either score is above the metric's optimum, negate both
                    # before the ">=" comparisons below.
                    # NOTE(review): presumably this converts a lower-is-better
                    # metric to higher-is-better - confirm.
                    if (init_model_best_score > self.stopping_metric._optimum
                        ) or (final_model_best_score >
                              self.stopping_metric._optimum):
                        init_model_best_score = -init_model_best_score
                        final_model_best_score = -final_model_best_score

                    if warm_start:
                        if init_model_best_score >= final_model_best_score:
                            self.model = init_model
                        else:
                            # The final model's best iteration is offset by the
                            # init model's tree count before shrinking.
                            best_iteration = init_model_tree_count + self.model.get_best_iteration(
                            )
                            self.model.shrink(ntree_start=0,
                                              ntree_end=best_iteration + 1)
                    else:
                        if init_model_best_score >= final_model_best_score:
                            self.model = init_model

        # Record the number of trees in the model that was kept.
        self.params_trained['iterations'] = self.model.tree_count_
Exemplo n.º 25
0
        '%Y-%m-%d') in hk_2018_holidays) * 1
# Target is the 'speed' column; 'id', 'date' and the target itself are not features.
y = dataset['speed']

X = dataset.drop(["id", "date", "speed"], axis=1)
# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_eval, y_train, y_eval = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=1)

# Initialize CatBoostRegressor
# use_best_model=True needs an eval_set, which is supplied to fit() below.
model = CatBoostRegressor(iterations=1100,
                          learning_rate=0.09,
                          depth=10,
                          use_best_model=True,
                          l2_leaf_reg=2)
eval_dataset = Pool(X_eval, y_eval)
# Fit model
model.fit(X_train, y_train, eval_set=eval_dataset, verbose=False)
# Print the best scores recorded during training
print(model.get_best_score())

# Build date-derived features for the test set.
test_set = pd.read_csv('test.csv')
test_set["date"] = pd.to_datetime(test_set["date"], format="%d/%m/%Y %H:%M")
test_set["year"] = test_set["date"].apply(lambda x: x.year)
test_set["month"] = test_set["date"].apply(lambda x: x.month)
test_set["day"] = test_set["date"].apply(lambda x: x.day)
test_set["hour"] = test_set["date"].apply(lambda x: x.hour)
test_set["weekday"] = test_set["date"].apply(lambda x: x.isoweekday())
# isoweekday() is 1..7, so this is the 0-based hour index within the week.
test_set['hour_of_week'] = (test_set["weekday"] * 24 - 24) + test_set["hour"]
# NOTE(review): this uses "month", not "day", so it is not the hour within the
# month. It may be intentional if the training features use the same formula -
# confirm against the training-side code before changing.
test_set['hour_of_month'] = (test_set["month"] * 24 - 24) + test_set["hour"]
test_set["quarter"] = test_set["date"].apply(lambda x: x.quarter)
Exemplo n.º 26
0
def test_real_numbers_cat_features():
    """Building a Pool that marks float-valued columns as categorical must raise CatboostError."""
    with pytest.raises(CatboostError):
        float_features = np.random.rand(100, 10)
        binary_labels = np.random.randint(2, size=100)
        float_columns_as_cat = [1, 2]
        Pool(float_features, binary_labels, float_columns_as_cat)
Exemplo n.º 27
0
def test_predict_sklearn_class():
    """Train a two-iteration classifier, save it and register the canonical file."""
    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    classifier.save_model(OUTPUT_MODEL_PATH)
    return local_canonical_file(OUTPUT_MODEL_PATH)
Exemplo n.º 28
0
def test_feature_importance():
    """Save the feature importances of a five-iteration classifier as a canonical file."""
    classifier = CatBoostClassifier(iterations=5, random_seed=0)
    classifier.fit(Pool(TRAIN_FILE, column_description=CD_FILE))
    importances = np.array(classifier.feature_importances_)
    np.save(FIMP_PATH, importances)
    return local_canonical_file(FIMP_PATH)
Exemplo n.º 29
0
def test_invalid_loss():
    """A CatBoost model configured with an unknown loss function must raise CatboostError."""
    # Load the pool outside the raises block so that a data-loading failure
    # cannot be mistaken for the expected error. Model construction stays
    # inside because the invalid loss may be rejected there or in fit().
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    with pytest.raises(CatboostError):
        model = CatBoost({"loss_function": "abcdef"})
        model.fit(pool)
Exemplo n.º 30
0
def test_load_file():
    """A Pool loaded from the train file must pass the shape check."""
    loaded_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    assert _check_shape(loaded_pool)
Exemplo n.º 31
0
def test_pool_cat_features():
    """The categorical indices reported by the Pool must all equal CAT_FEATURES."""
    detected = Pool(TRAIN_FILE, column_description=CD_FILE).get_cat_feature_indices()
    assert np.all(detected == CAT_FEATURES)
Exemplo n.º 32
0
def test_one_doc_feature_importance():
    """Save the 'Doc' feature importance of a single all-ones document as a canonical file."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier(iterations=5, random_seed=0)
    model.fit(pool)

    # A single document whose every feature value is 1.
    single_doc = np.ones(pool.num_col(), dtype=int)
    doc_importance = model.get_feature_importance(
        single_doc,
        0,
        cat_features=pool.get_cat_feature_indices(),
        fstr_type='Doc',
    )
    np.save(FIMP_PATH, np.array(doc_importance))
    return local_canonical_file(FIMP_PATH)
Exemplo n.º 33
0
    plt.ylabel(labl)
    plt.title("mean " + labl+":  "+ str(round(obj.mean(),1)))
    plt.show()
# One figure per metric array, drawn with plot1.
plt.figure(0)
plot1(rrmse, "RRMSE")
plt.figure(1)
plot1(rmse, "RMSE")
plt.figure(2)
plot1(r1 , "Rsquared")

# LightGBM section (presumably); only the preprocessing call is present
X_train, Y_train, X_test, Y_test = preprocessing(ap, False, False, select_hr=None)
# CatBoost section; repeats the same preprocessing call
X_train, Y_train, X_test, Y_test = preprocessing(ap, False, False, select_hr=None)
# "hours" is treated as a categorical feature in both pools.
train_pool = Pool(X_train, 
                  Y_train, 
                  cat_features=["hours"])
# The test pool carries no labels.
test_pool = Pool(X_test, 
                 cat_features=["hours"]) 

model = CatBoostRegressor(iterations=300, 
                          depth=6, 
                          learning_rate=0.007, 
                          loss_function='RMSE')
# train the model
model.fit(train_pool)
# make the prediction using the resulting model
preds = model.predict(test_pool)
print(preds)

#1 = np.delete(r1, np.where(r1 ==16)) 
Exemplo n.º 34
0
            fold_importance["fold"] = fold_n + 1
            feature_importance = pd.concat(
                [feature_importance, fold_importance], axis=0)

            bond_scores.append(mean_absolute_error(y_valid, y_pred_valid))
            logger.info('CV mean score: {0:.4f}, std: {1:.4f}.'.format(
                np.mean(bond_scores), np.std(bond_scores)))
            oof[valid_idx] = y_pred_valid.reshape(-1,)
            prediction_type += y_pred
        elif MODEL_TYPE == 'catboost':
            fold_start = timer()
            logger.info('Running Type {} - Fold {} of {}'.format(bond_type,
                                                           fold_count, folds.n_splits))
            X_train, X_valid = X_type.iloc[train_idx], X_type.iloc[valid_idx]
            y_train, y_valid = y_type.iloc[train_idx], y_type.iloc[valid_idx]
            train_dataset = Pool(data=X_train.drop('type', axis=1), label=y_train)
            valid_dataset = Pool(data=X_valid.drop('type', axis=1), label=y_valid)
            test_dataset = Pool(data=X_test_type.drop('type', axis=1))
            DEPTH = 7
            update_tracking(run_id, 'depth', DEPTH)
            model = CatBoostRegressor(iterations=N_ESTIMATORS,
                                         learning_rate=LEARNING_RATE,
                                         depth=DEPTH,
                                         eval_metric=EVAL_METRIC,
                                         verbose=VERBOSE,
                                         random_state = RANDOM_STATE,
                                         thread_count=N_THREADS,
                                         #loss_function=EVAL_METRIC,
                                         task_type = "GPU") # Train on GPU

            model.fit(train_dataset,
Exemplo n.º 35
0
def _get_train_test_pool(dataset):
    """Build the train and test pools for ``dataset``.

    The train, test and column-description paths come from
    ``_get_train_test_cd_path``; both pools are created with the same
    column-description file.

    Returns:
        A ``(train_pool, test_pool)`` tuple.
    """
    train_path, test_path, cd_path = _get_train_test_cd_path(dataset)
    return (
        Pool(train_path, column_description=cd_path),
        Pool(test_path, column_description=cd_path),
    )
# Features are every column except 'price'; the target is the natural log of
# 'price', so everything downstream (training, predictions, RMSE) is on the
# log-price scale.
trained = trained_set.drop('price', axis=1)
trained_price = np.log(trained_set['price'])

# First split: shuffle=False keeps the existing row order, so the final 33% of
# rows become the hold-out part.
X_trained, X_test, y_trained, y_test = train_test_split(trained,
                                                        trained_price,
                                                        test_size=0.33,
                                                        random_state=42,
                                                        shuffle=False)
# Second split: the hold-out part is divided again. X_test / y_test are
# REBOUND here to the leading 67% of the hold-out, and X_values / y_values
# receive the trailing 33%, which is used below as the evaluation set.
X_test, X_values, y_test, y_values = train_test_split(X_test,
                                                      y_test,
                                                      test_size=0.33,
                                                      random_state=42,
                                                      shuffle=False)

# Pools are built from the raw ndarray values; the test pool carries no label.
trained_pool = Pool(X_trained.values, y_trained.values)
test_pool = Pool(X_test.values)
values_pool = Pool(X_values.values, y_values.values)

cbr = CatBoostRegressor(iterations=99,
                        depth=10,
                        learning_rate=0.3,
                        loss_function='RMSE',
                        random_seed=42,
                        eval_metric='RMSE',
                        use_best_model=True)
# values_pool is the evaluation set used for use_best_model / early stopping.
cbr.fit(trained_pool, eval_set=values_pool, early_stopping_rounds=80)
predictions = cbr.predict(test_pool)
# Report RMSE between the log-price targets and the predictions.
print('RMSE: {}'.format(
    math.sqrt(mean_squared_error(y_test.values, predictions))))
Exemplo n.º 37
0
def get_pool(data, labels, cat_f):
    """Return a ``Pool`` built from ``data`` and ``labels``.

    ``cat_f`` is forwarded unchanged as the ``cat_features`` argument.
    """
    pool = Pool(data, labels, cat_features=cat_f)
    return pool
Exemplo n.º 38
0
    c.append(i[0])
data.columns = c

# Keep only the selected columns, then drop every row with a missing value.
print("Preparing data...")
data = data.loc[:, [
    'suburb', 'rooms', 'type', 'price', 'postcode', 'bathroom', 'car'
]]
data = data.dropna()
# 'price' is the regression target; the remaining columns are the features.
X = data.drop(columns=["price"])
y = data["price"]

# Split data for training and evaluation (80% / 20%).
# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Columns handed to CatBoost as categorical features.
# NOTE(review): if 'postcode' is stored as float, CatBoost may reject it as a
# categorical column -- confirm its dtype or cast to str/int upstream.
categorical = ['suburb', 'type', 'postcode']
train_pool = Pool(X_train, y_train, cat_features=categorical)
test_pool = Pool(X_test, cat_features=categorical)

# Find optimal hyper-parameters: candidate values for the search below.
print("Training model...")
model = CatBoostRegressor(loss_function="RMSE", logging_level=None)
grid = {
    'learning_rate': [0.01, 0.03, 0.06, 0.09, 0.12],
    'depth': [4, 6, 8, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'random_strength': [2, 4],
    'bagging_temperature': [0, 1],
}
grid_search_result = model.randomized_search(grid,
                                             X=train_pool,
                                             search_by_train_test_split=True,
Exemplo n.º 39
0
def test_cv_logging():
    """Run a short Logloss cross-validation and return its canonical JSON log.

    The log at ``JSON_LOG_PATH`` is passed through ``remove_time_from_json``
    before being wrapped by ``local_canonical_file``.
    """
    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    train_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cv(train_pool, cv_params)
    cleaned_log = remove_time_from_json(JSON_LOG_PATH)
    return local_canonical_file(cleaned_log)
Exemplo n.º 40
0
## Submission: write the predictions computed earlier in the script
## (outside this block) to CSV.
sub_df = pd.DataFrame({"ID_code": test_df["ID_code"].values})
sub_df["target"] = predictions
sub_df.to_csv("lgb_submission.csv", index=False)
## Catboost : https://www.kaggle.com/wakamezake/starter-code-catboost-baseline
from catboost import Pool, CatBoostClassifier
model = CatBoostClassifier(loss_function="Logloss", eval_metric="AUC")
kf = KFold(n_splits=5, random_state=42, shuffle=True)

# Out-of-fold predictions, indexed like ``target``. Multiplying by 0.0 (not 0)
# makes the Series float from the start, so writing probabilities into it does
# not rely on pandas silently upcasting an integer Series.
y_valid_pred = 0.0 * target
# Running sum of the per-fold test predictions; averaged after the loop.
y_test_pred = 0

for idx, (train_index, valid_index) in enumerate(kf.split(train_df)):
    y_train, y_valid = target.iloc[train_index], target.iloc[valid_index]
    X_train = train_df[features].iloc[train_index, :]
    X_valid = train_df[features].iloc[valid_index, :]
    _train = Pool(X_train, label=y_train)
    _valid = Pool(X_valid, label=y_valid)
    print("\nFold ", idx)
    # The same estimator object is refitted on every fold.
    fit_model = model.fit(_train,
                          eval_set=_valid,
                          use_best_model=True,
                          verbose=200)
    # Column 1 of predict_proba is taken as the positive-class probability.
    pred = fit_model.predict_proba(X_valid)[:, 1]
    print("  auc = ", roc_auc_score(y_valid, pred))
    y_valid_pred.iloc[valid_index] = pred
    y_test_pred += fit_model.predict_proba(test_df[features])[:, 1]
# Average over the number of folds configured on ``kf`` rather than a
# hard-coded 5, so changing n_splits above cannot desynchronise the mean.
y_test_pred /= kf.get_n_splits()
## Submission frame for the CatBoost test predictions.
sub_df1 = pd.DataFrame({"ID_code": test_df["ID_code"].values})
sub_df1["target"] = y_test_pred
Exemplo n.º 41
0
def test_load_ndarray():
    """Check that a Pool rebuilt from ndarray inputs passes ``_check_shape``.

    Features (after ``map_cat_features``) and labels are read back from a
    file-backed pool, converted to ndarrays, and used to build a second Pool
    with the same categorical feature indices.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_idx = source_pool.get_cat_feature_indices()
    mapped_features = map_cat_features(source_pool.get_features(), cat_idx)
    features = np.array(mapped_features)
    labels = np.array(source_pool.get_label())
    rebuilt_pool = Pool(features, labels, cat_idx)
    assert _check_shape(rebuilt_pool)
Exemplo n.º 42
0
def test_cv_with_not_binarized_target():
    """Run a short Logloss cross-validation on the 'adult_not_binarized' data.

    Returns the ``local_canonical_file`` wrapper of the JSON log at
    ``JSON_LOG_PATH`` after it has gone through ``remove_time_from_json``.
    """
    dataset = 'adult_not_binarized'
    pool = Pool(
        data_file(dataset, 'train_small'),
        column_description=data_file(dataset, 'train.cd'),
    )
    cv_params = {"iterations": 5, "random_seed": 0, "loss_function": "Logloss"}
    cv(pool, cv_params)
    cleaned_log = remove_time_from_json(JSON_LOG_PATH)
    return local_canonical_file(cleaned_log)
Exemplo n.º 43
0
def test_load_ndarray():
    """Check that a Pool rebuilt from ndarray inputs passes ``_check_shape``.

    Features (after ``map_cat_features``) and labels are read back from a
    file-backed pool, converted to ndarrays, and used to build a second Pool
    with the same categorical feature indices.
    """
    source_pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    cat_idx = source_pool.get_cat_feature_indices()
    mapped_features = map_cat_features(source_pool.get_features(), cat_idx)
    features = np.array(mapped_features)
    labels = np.array(source_pool.get_label())
    rebuilt_pool = Pool(features, labels, cat_idx)
    assert _check_shape(rebuilt_pool)
# Fit model
# Positions of every column whose dtype is not int64; these are passed to
# CatBoost as the categorical features.
# NOTE(review): the original comment says this is only the ORIGIN column, but
# any float/object column would be selected as well -- confirm X_train's dtypes.
cat_ft_indices = np.where(X_train.dtypes != np.int64)[0]

cb_model.fit(X_train,
             y_train,
             cat_features=cat_ft_indices,
             eval_set=(X_test, y_test),
             plot=True)

# Accuracy & Cross-Validation
# Score is computed on the training data itself, not on the held-out set.
cb_accuracy = cb_model.score(X_train, y_train)

# Cross-validation on the training data, reusing the fitted model's parameters.
train_pool = Pool(X_train, y_train, cat_ft_indices)

cross_val_paramt = cb_model.get_params()

cross_val_results = cv(pool=train_pool,
                       params=cross_val_paramt,
                       fold_count=10,
                       plot=True)

# Mean / min / max of the 'test-MultiClass-mean' column over iterations.
# NOTE(review): the column name suggests the MultiClass metric (a loss) rather
# than accuracy, despite the *_acc_* variable names -- confirm before reporting
# these values as accuracy.
cb_cross_val_acc_avg = np.mean(cross_val_results['test-MultiClass-mean'])

cb_cross_val_acc_min = np.min(cross_val_results['test-MultiClass-mean'])

cb_cross_val_acc_max = np.max(cross_val_results['test-MultiClass-mean'])

print('Average Cross Validation Score:', round(cb_cross_val_acc_avg * 100,