import numpy as np
from catboost import CatBoostClassifier
from catboost.datasets import titanic


def test_titanic():
    # Smoke test: train a tiny model on the Titanic training split and predict on it.
    train_df = titanic()[0].fillna(-999)
    X, y = train_df.drop('Survived', axis=1), train_df.Survived
    # Non-float columns are treated as categorical features
    # (the builtin float is used for the dtype check; np.float has been removed from NumPy).
    categorical_features_indices = np.where(X.dtypes != float)[0]
    model = CatBoostClassifier(iterations=5)
    model.fit(X, y, cat_features=categorical_features_indices)
    preds = model.predict(X)
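# Hedged extension sketch (not part of the original test): the same smoke test with a couple
# of sanity assertions on the model's output. It reuses the imports above, and the assertion
# targets are illustrative rather than taken from the source.
def test_titanic_sanity():
    train_df = titanic()[0].fillna(-999)
    X, y = train_df.drop('Survived', axis=1), train_df.Survived
    cat_idx = np.where(X.dtypes != float)[0]

    model = CatBoostClassifier(iterations=5, verbose=False)
    model.fit(X, y, cat_features=cat_idx)

    preds = model.predict(X)
    proba = model.predict_proba(X)

    assert len(preds) == len(X)          # one prediction per training row
    assert proba.shape == (len(X), 2)    # binary task: probabilities for two classes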
import numpy as np
from catboost.datasets import titanic
from sklearn.model_selection import train_test_split


def build_titanic_dataset():
    df_train, df_test = titanic()
    df_train.fillna(-999, inplace=True)
    df_test.fillna(-999, inplace=True)

    X = df_train.drop('Survived', axis=1)  # X: training features
    y = df_train.Survived                  # y: training labels

    # If a feature's dtype is not float, treat it as a categorical feature.
    cate_feat_idx = np.where(X.dtypes != float)[0]

    x_train, x_vali, y_train, y_vali = train_test_split(
        X, y, train_size=0.75, random_state=42)
    x_test = df_test

    return X, y, x_train, x_vali, y_train, y_vali, x_test, cate_feat_idx
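# Hedged usage sketch (not part of the original snippet): one way the tuple returned by
# build_titanic_dataset() might be consumed. The CatBoost settings below are illustrative.
from catboost import CatBoostClassifier

X, y, x_train, x_vali, y_train, y_vali, x_test, cate_feat_idx = build_titanic_dataset()

model = CatBoostClassifier(iterations=200, learning_rate=0.1, verbose=False)
model.fit(x_train, y_train,
          cat_features=cate_feat_idx,
          eval_set=(x_vali, y_vali))
print('validation accuracy:', model.score(x_vali, y_vali))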
from catboost import CatBoostClassifier
from catboost.datasets import titanic


def main(**args):
    titanic_train, _ = titanic()
    titanic_train.fillna(-999, inplace=True)

    cols = [
        'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
        'Ticket', 'Fare', 'Cabin', 'Embarked'
    ]

    # Use the first 10% of rows for training and the remaining 90% for testing.
    train_sz = int(titanic_train.shape[0] * 0.1)
    x_train = titanic_train[:train_sz][cols]
    y_train = titanic_train[:train_sz]['Survived'].astype(int)
    x_test = titanic_train[train_sz:][cols]
    y_test = titanic_train[train_sz:]['Survived'].astype(int)

    try:
        model = CatBoostClassifier(random_seed=42, **args)
        # The positional list holds the indices of the categorical columns:
        # Name, Sex, Ticket, Cabin, Embarked.
        model.fit(x_train, y_train, [1, 2, 6, 8, 9], silent=True)
        accuracy = model.score(x_test, y_test)
        # Report the negated accuracy; fall back to 0 if training fails.
        print(-accuracy)
    except Exception:
        print(0)
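# Hedged driver sketch (not from the original file): main() forwards its keyword arguments to
# CatBoostClassifier and prints the negated accuracy, which suggests it is meant to feed a
# minimizing hyperparameter search. The parameter values below are illustrative only.
if __name__ == '__main__':
    main(iterations=100, depth=4, learning_rate=0.1)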
def load_test(self):
    _, self.df_test = titanic()
    # The test split ships without labels; mark them with a -1 sentinel.
    self.df_test["Survived"] = -1
def load_train(self):
    self.df_train, _ = titanic()
from pagi.models.binary_classifiers import get_CatBoostClassifier

if __name__ == "__main__":
    from catboost.datasets import titanic
    import numpy as np

    train_df, test_df = titanic()
    train_df.head()

    train_df.fillna(-999, inplace=True)
    test_df.fillna(-999, inplace=True)

    X = train_df.drop('Survived', axis=1)
    y = train_df.Survived
    print(X.dtypes)

    # Non-float columns are treated as categorical features.
    categorical_features_indices = np.where(X.dtypes != float)[0]

    from sklearn.model_selection import train_test_split
    X_train, X_validation, y_train, y_validation = train_test_split(
        X, y, train_size=0.75, random_state=42)
    X_test = test_df

    model = get_CatBoostClassifier()
    model.fit(X_train, y_train,
              cat_features=categorical_features_indices,
              eval_set=(X_validation, y_validation),
              plot=True)
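    # Hedged continuation sketch (not in the original script): score the validation split and
    # predict the held-out Kaggle test set. Assumes get_CatBoostClassifier() returns a standard
    # CatBoostClassifier, as the fit() call above implies.
    print('validation accuracy:', model.score(X_validation, y_validation))
    test_predictions = model.predict(X_test)
    print(test_predictions[:10])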
"""
Objective: CatBoost algorithm implementation
Author: Samuel Adebayo
Blog: https://dataaspirant.com
Date: 2021-01-02
===============================================
"""

## import the libraries needed
import pandas as pd
import numpy as np

# Here we import our dataset from the CatBoost dataset library
from catboost.datasets import titanic

## The titanic dataset comes as a train set and a test set, so we unpack it into two DataFrames
titanic_train, titanic_test = titanic()

## Here we create a list to reorder the columns so that the "Survived" column comes last,
## because "Survived" is the target
column_sort = [
    'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked', 'Survived'
]

## Now we apply the sorted columns to the train data
train = titanic_train[column_sort]
train.set_index('Pclass')  ## Not necessary, just to get rid of the default index
                           ## (without reassignment this call does not modify train)
test = titanic_test

train.head()
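## --- Hedged continuation sketch (not from the original blog post) ---
## A natural next step: fill the missing values, split the features from the "Survived" target,
## and fit a CatBoostClassifier. The model settings here are illustrative.
from catboost import CatBoostClassifier

train_filled = train.fillna(-999)
features = train_filled.drop('Survived', axis=1)
target = train_filled['Survived']

## Non-float columns (including the integer-coded ones) are passed as categorical features
cat_idx = np.where(features.dtypes != float)[0]

clf = CatBoostClassifier(iterations=100, random_seed=42, verbose=False)
clf.fit(features, target, cat_features=cat_idx)
print('training accuracy:', clf.score(features, target))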
def _data_loading(self):
    # The data for this tutorial can be obtained from [this page](https://www.kaggle.com/c/titanic/data)
    self.train_df, self.test_df = titanic()
    print(self.train_df.head())
# coding=utf-8
import json
import os
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
from sklearn.model_selection import train_test_split
from catboost.datasets import titanic

# get training data
train, test = titanic()

# remove nans
train, test = train.fillna(-999), test.fillna(-999)

# split into train and test
X, y = train.drop(['PassengerId', 'Survived'], axis=1), train['Survived']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

# define categorical features
cat_indices = np.where(X_train.dtypes != float)[0]

# define model params
params = {
    'iterations': 1000,
    'depth': 2,
    'loss_function': 'Logloss',
    'eval_metric': 'F1',
    'use_best_model': True,