def test_wrong_feature_count():
    """Predicting with one column fewer than was used for fitting must raise ``CatboostError``."""
    data = np.random.rand(100, 10)
    label = np.random.randint(2, size=100)
    model = CatBoostClassifier()
    model.fit(data, label)
    # Only the mismatching predict call is expected to fail. Set-up and
    # training stay outside the context manager so that an unrelated error
    # raised there cannot satisfy the assertion by accident.
    with pytest.raises(CatboostError):
        model.predict(data[:, :-1])
def test_no_cat_in_predict():
    """Compare predictions from raw mapped features with predictions from a ``Pool``.

    The second prediction wraps the same mapped features in a ``Pool`` that
    declares the categorical feature indices explicitly; both results are
    handed to ``_check_data``.
    """
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn)

    # Variant 1: plain mapped feature matrix, no categorical indices given.
    bare_features = map_cat_features(holdout.get_features(), learn.get_cat_feature_indices())
    pred1 = classifier.predict(bare_features)

    # Variant 2: the same mapping wrapped in a Pool with categorical indices.
    declared_pool = Pool(
        map_cat_features(holdout.get_features(), learn.get_cat_feature_indices()),
        cat_features=learn.get_cat_feature_indices(),
    )
    pred2 = classifier.predict(declared_pool)

    assert _check_data(pred1, pred2)
def test_ignored_features():
    """Train with and without ``ignored_features`` and require different predictions.

    The model trained with ignored features is saved to ``OUTPUT_MODEL_PATH``
    and the result of ``compare_canonical_models`` for that path is returned.
    """
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    shared_params = dict(iterations=5, random_seed=0)
    restricted = CatBoostClassifier(ignored_features=[1, 2, 3], **shared_params)
    unrestricted = CatBoostClassifier(**shared_params)

    restricted.fit(learn)
    unrestricted.fit(learn)

    restricted_pred = restricted.predict(holdout)
    unrestricted_pred = unrestricted.predict(holdout)
    assert not _check_data(restricted_pred, unrestricted_pred)

    restricted.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_fit_data():
    """Fit from raw arrays with sample weights, a baseline and an eval set.

    A first MultiClass model provides raw-formula baselines for both the
    train and the eval pool. A second model is then fitted on the mapped
    feature matrix, saved, and checked through ``compare_canonical_models``.
    """
    raw_type = 'RawFormulaVal'
    model_params = dict(iterations=2, random_seed=0, loss_function="MultiClass")

    pool = Pool(CLOUDNESS_TRAIN_FILE, column_description=CLOUDNESS_CD_FILE)
    eval_pool = Pool(CLOUDNESS_TEST_FILE, column_description=CLOUDNESS_CD_FILE)

    # Stage 1: reference model whose raw outputs become the baselines.
    base_model = CatBoostClassifier(**model_params)
    base_model.fit(pool)
    baseline = np.array(base_model.predict(pool, prediction_type=raw_type))
    eval_baseline = np.array(base_model.predict(eval_pool, prediction_type=raw_type))
    eval_pool.set_baseline(eval_baseline)

    # Stage 2: model fitted on plain data rather than on a Pool.
    model = CatBoostClassifier(**model_params)
    data = map_cat_features(pool.get_features(), pool.get_cat_feature_indices())
    row_weights = np.arange(1, pool.num_row() + 1)
    model.fit(
        data,
        pool.get_label(),
        pool.get_cat_feature_indices(),
        sample_weight=row_weights,
        baseline=baseline,
        use_best_model=True,
        eval_set=eval_pool,
    )

    model.save_model(OUTPUT_MODEL_PATH)
    return compare_canonical_models(OUTPUT_MODEL_PATH)
def test_custom_objective():
    """A hand-written Logloss objective must match the built-in ``Logloss`` raw predictions."""

    class LoglossObjective(object):
        """Custom objective that supplies Logloss derivatives."""

        def calc_ders_range(self, approxes, targets, weights):
            """Return a list of ``(der1, der2)`` tuples, one per object."""
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            # ``range`` instead of ``xrange`` so the test also runs on Python 3.
            # Access stays index-based: only len() and indexing are used on
            # the containers passed in.
            exponents = [math.exp(approxes[index]) for index in range(len(approxes))]

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(
        iterations=5,
        random_seed=0,
        use_best_model=True,
        loss_function=LoglossObjective(),
        eval_metric="Logloss",
        # Leaf estimation method and iteration count are set to match the
        # defaults for Logloss. ``leaf_estimation_iterations`` is the name the
        # other tests in this file use for what was ``gradient_iterations``.
        leaf_estimation_method="Newton",
        leaf_estimation_iterations=10,
    )
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def predict(model_path, X_test, is_lgbm=False, is_catboost=False, is_cnn=False, maxlen=400, lgbm_threshold=0.5):
    """Load a persisted model and predict on unseen data.

    The flags select the loader: LightGBM booster, CatBoost classifier, a
    model restored through ``load_model`` (the CNN case), or, when no flag
    is set, an object restored with ``joblib``.

    :param model_path: path of the saved model.
    :param X_test: data passed to the model's ``predict``.
    :param is_lgbm: load with ``lgb.Booster`` and threshold the outputs.
    :param is_catboost: load with ``CatBoostClassifier.load_model``.
    :param is_cnn: load with ``load_model`` and take the argmax per row.
    :param maxlen: unused; kept for backward compatibility.
    :param lgbm_threshold: scores strictly above this value become 1, others 0.
    :return: ``np.ndarray`` of labels for the LightGBM and CNN cases,
        otherwise whatever the model's ``predict`` returns.
    """
    print('\n === predict === \n')

    if is_lgbm:
        model = lgb.Booster(model_file=model_path)
    elif is_catboost:
        model = CatBoostClassifier()
        # load_model populates the instance in place; do not rely on its
        # return value being the model.
        model.load_model(model_path)
    elif is_cnn:
        model = load_model(model_path)
    else:
        # sklearn / xgboost objects persisted with joblib
        model = joblib.load(model_path)

    y_pred = model.predict(X_test)

    if is_lgbm:
        # The booster returns scores; binarise them with the threshold.
        return np.array([1 if y > lgbm_threshold else 0 for y in y_pred])
    if is_cnn:
        # Reuse the prediction computed above instead of running the
        # network a second time.
        return np.array([np.argmax(y) for y in y_pred])
    return y_pred
def test_raw_predict_equals_to_model_predict():
    """Raw-formula predictions on the eval pool must equal ``get_test_eval()`` element-wise."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=10, random_seed=0)
    classifier.fit(learn, eval_set=holdout)

    raw_pred = classifier.predict(holdout, prediction_type='RawFormulaVal')
    elementwise_equal = classifier.get_test_eval() == raw_pred
    assert all(elementwise_equal)
def test_adult():
    """Smoke test: fit and predict on the adult dataset loaded as DataFrames."""
    train, test = adult()

    # CatBoost doesn't support pandas.DataFrame NaNs out of the box for categorical features, and
    # this dataset has NaNs only for categorical features, so we'll replace them manually with
    # string "nan"
    #
    # see issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker.
    #
    # oh, and don't forget to replace missing values with string "nan" when you are going to apply
    # the model!
    train.fillna(value='nan', inplace=True)
    test.fillna(value='nan', inplace=True)

    X_train, y_train = train.drop('income', axis=1), train.income
    X_test, y_test = test.drop('income', axis=1), test.income

    model = CatBoostClassifier(iterations=5, loss_function='CrossEntropy', class_names=['<=50K', '>50K'])
    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        # ``np.float`` was only an alias of the builtin ``float`` and no longer
        # exists in current NumPy; comparing with ``float`` is equivalent.
        cat_features=np.where(X_train.dtypes != float)[0],
    )

    # Only checks that prediction runs; the result is not inspected.
    predictions = model.predict(X_test)
def test_adult():
    """Smoke test: fit and predict on the adult dataset loaded as DataFrames."""
    train, test = adult()

    # CatBoost doesn't support pandas.DataFrame NaNs out of the box for categorical features,
    # so we'll replace them manually with some special string (we'll use "nan")
    #
    # see issue #571 on GitHub or issue MLTOOLS-2785 in internal tracker.
    #
    # oh, and don't forget to replace missing values with string "nan" when you are going to apply
    # the model!
    for dataset in (train, test):
        # ``.items()`` works on Python 2 and 3 (``iteritems`` is Python 2 only);
        # ``object`` is what the removed ``np.object`` alias pointed to.
        for name, dtype in dict(dataset.dtypes).items():
            if dtype == object:
                # Assign back instead of an in-place fillna on the selected
                # column, which may act on a temporary copy.
                dataset[name] = dataset[name].fillna('nan')

    X_train, y_train = train.drop('income', axis=1), train.income
    X_test, y_test = test.drop('income', axis=1), test.income

    model = CatBoostClassifier(iterations=5, loss_function='CrossEntropy', class_names=['<=50K', '>50K'])
    model.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        # ``np.float`` was an alias of the builtin ``float``; same comparison.
        cat_features=np.where(X_train.dtypes != float)[0],
    )

    # Only checks that prediction runs; the result is not inspected.
    predictions = model.predict(X_test)
def rfe_cat(train_x, train_y, valid_x, valid_y, min_):
    """Run recursive feature elimination for each feature count in ``range(min_, 36)``.

    For every target count a fresh classifier selects features (by SHAP
    values) out of the range '0-34', trains a final model, and is scored on
    the validation pool with micro-averaged F1. Column 0 is treated as
    categorical in both pools.

    :return: the selected feature names of the first run with the best F1.
    """
    learn_pool = Pool(train_x, train_y, cat_features=[0])
    holdout_pool = Pool(valid_x, valid_y, cat_features=[0])

    scores, feature_counts, selections = [], [], []

    print('Start Recursive Feature Elimination')
    candidate_counts = range(min_, 36)
    for target_count in tqdm_notebook(candidate_counts, desc='Iterating Feature Elimination'):
        selector = CatBoostClassifier(iterations=50, random_seed=1234, used_ram_limit='10gb')
        summary = selector.select_features(
            learn_pool,
            eval_set=holdout_pool,
            features_for_select='0-34',
            num_features_to_select=target_count,
            steps=2,
            algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
            shap_calc_type=EShapCalcType.Regular,
            train_final_model=True,
            logging_level='Silent',
        )
        holdout_labels = selector.predict(holdout_pool).tolist()
        scores.append(f1_score(valid_y, holdout_labels, average='micro'))
        feature_counts.append(target_count)
        selections.append(summary['selected_features_names'])

    top_score = max(scores)
    print('Best F-1 score: ', top_score)
    # list.index returns the earliest run that reached the top score.
    winner = scores.index(top_score)
    print('Best Number feature: ', feature_counts[winner])
    print('Selected of Feature names: \n', selections[winner])
    return selections[winner]
def train(train_x, train_y, kfold, best_params=None, algorithm_name=None):
    """Fit one CatBoost classifier per fold and record its validation accuracy.

    ``best_params`` and ``algorithm_name`` are accepted but not read by this
    function.

    :param train_x: feature frame, indexed positionally via ``.iloc``.
    :param train_y: target, indexed positionally via ``.iloc``.
    :param kfold: splitter whose ``split(train_x, train_y)`` yields index pairs.
    :return: ``(models, acc_results)``, both in fold order.
    """

    def _rows(frame, positions):
        # Positional selection with the index reset to 0..n-1.
        return frame.iloc[positions].reset_index(drop=True)

    fitted_models = []
    fold_accuracies = []

    for fit_positions, holdout_positions in kfold.split(train_x, train_y):
        fit_x = _rows(train_x, fit_positions)
        fit_y = _rows(train_y, fit_positions)
        holdout_x = _rows(train_x, holdout_positions)
        holdout_y = _rows(train_y, holdout_positions)

        classifier = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
        )
        classifier.fit(fit_x, fit_y, eval_set=(holdout_x, holdout_y), plot=True)

        holdout_pred = classifier.predict(holdout_x)
        fitted_models.append(classifier)
        fold_accuracies.append(accuracy_score(holdout_y, holdout_pred))

    return fitted_models, fold_accuracies
class CatBoostWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc adapter for a CatBoost classifier and its pre-processing pipeline."""

    def load_context(self, context):
        """Restore the pipeline, column configuration and CatBoost model from ``context.artifacts``."""
        # pylint: disable=attribute-defined-outside-init
        artifacts = context.artifacts

        with open(artifacts['pipeline'], 'rb') as pipeline_file:
            self.pipeline = pickle.load(pipeline_file)

        with open(artifacts['col_config'], 'rb') as config_file:
            column_config = pickle.load(config_file)

        self.clf = CatBoostClassifier()
        self.clf.load_model(artifacts['cbm_model'])

        self.col_names = column_config['col_names']
        self.preserve_cols = column_config['preserve_neg_vals']

    def preprocess(self, data):
        """
        Run the stored pre-processing steps on the configured feature columns.

        :param data: Input dataset.
        :return: Output of ``self.pipeline.transform`` on the cleaned data.
        """
        selected = data[self.col_names]
        without_inf = remove_inf_values(selected)
        cleaned = remove_negative_values(without_inf, ignore_cols=self.preserve_cols)
        return self.pipeline.transform(cleaned)

    def predict(self, context, model_input):
        """Pre-process ``model_input`` and return the classifier's predictions."""
        return self.clf.predict(self.preprocess(model_input))
def get_predict_2020():
    """Predict ``CODE_CULT_2020`` for the grouped 2020 test set and write it to CSV.

    Steps:
      1. Read the grouped test data and encode the 2015-2019 crop codes as
         integer ids using a token dictionary (loaded from
         ``cult_token.txtdic`` or built from the training data and cached).
      2. Predict with the saved CatBoost model from the four most recent
         years plus the coordinates.
      3. Override the prediction for rows whose history is constant over
         2015-2019 (reuse that code) and for rows matching the two-year
         pattern tested below (reuse the 2019 code).
      4. Decode the ids back to tokens and write ``predict_2020_full.csv``
         and ``predict_2020.csv``.
    """
    history_columns = [f'CODE_CULT_{year}' for year in range(2015, 2020)]

    df_data = pd.read_csv("dvhb_data/test/test 2020/grouped_full.csv", index_col=0)

    # Encode the words as ids.
    if os.path.isfile('cult_token.txtdic'):
        dictionary = corpora.Dictionary.load('cult_token.txtdic')
    else:
        df_train_full = my_full_cvs("dvhb_data/train", "train_full.csv")
        df_train_full.columns = ['CODE_CULT', 'CODE_GROUP', 'CENTROID', 'YEAR']
        dictionary = corpora.Dictionary([df_train_full['CODE_CULT'].tolist()])
        dictionary.save('cult_token.txtdic')

    # Replace every crop code with its id from the dictionary.
    for column in history_columns:
        df_data[column] = df_data[column].map(dictionary.token2id)

    model = CatBoostClassifier()
    model.load_model("catboostmodel")

    # The features are passed as '1'..'4' (oldest to newest of the last four
    # years) followed by the coordinates.
    feature_frame = df_data[
        ['CODE_CULT_2016', 'CODE_CULT_2017', 'CODE_CULT_2018', 'CODE_CULT_2019', 'LATITUDE', 'LONGTITUDE']
    ].rename(columns={
        'CODE_CULT_2016': '1',
        'CODE_CULT_2017': '2',
        'CODE_CULT_2018': '3',
        'CODE_CULT_2019': '4',
    })
    predictions_valid = model.predict(feature_frame)
    df_data = df_data.assign(CODE_CULT_2020=predictions_valid)

    c2015, c2016, c2017, c2018, c2019 = (df_data[column] for column in history_columns)

    # Same code in every year 2015-2019.
    permanent_mask = (c2015 == c2016) & (c2015 == c2017) & (c2015 == c2018) & (c2015 == c2019)
    # 2015 == 2016, 2017 == 2018, the two pairs differ, and 2019 differs
    # from 2018; rows sharing an index label with a permanent row are excluded.
    two_year_mask = (
        (c2015 == c2016)
        & (c2017 == c2018)
        & (c2015 != c2018)
        & (c2019 != c2018)
        & ~df_data.index.isin(df_data.index[permanent_mask])
    )

    # A single ``.loc[rows, column]`` assignment writes into df_data itself.
    # The previous chained form ``df_data.loc[label][column] = value`` could
    # assign to a temporary copy and silently drop the override.
    # ``.values`` avoids index alignment on the right-hand side.
    df_data.loc[permanent_mask, 'CODE_CULT_2020'] = df_data.loc[permanent_mask, 'CODE_CULT_2015'].values
    df_data.loc[two_year_mask, 'CODE_CULT_2020'] = df_data.loc[two_year_mask, 'CODE_CULT_2019'].values

    # Decode ids back to the original tokens.
    for column in ['CODE_CULT_2020'] + history_columns[::-1]:
        df_data[column] = df_data[column].map(dictionary.get)

    df_data[
        ['CODE_CULT_2015', 'CODE_CULT_2016', 'CODE_CULT_2017', 'CODE_CULT_2018', 'CODE_CULT_2019',
         'CODE_CULT_2020', 'LATITUDE', 'LONGTITUDE']
    ].to_csv('predict_2020_full.csv', index=True)
    df_data['CODE_CULT_2020'].to_csv('predict_2020.csv', index=True)
def trainDecisionTree(self):
    """Train a MultiClass CatBoost model on ``self.dataset`` and evaluate it on a held-out split.

    'id' and 'radiantClass' are dropped from the features; 'radiantClass' is
    the target. 20% of the rows are held out with a stratified split.

    :return: ``[accuracy, f1score, kappa, cm, y_test, pred]`` where
        ``f1score`` is computed with ``average=None``.
    """
    features = self.dataset.drop(['id', 'radiantClass'], axis=1)
    target = self.dataset['radiantClass']

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, random_state=0, stratify=target
    )

    classifier = CatBoostClassifier(
        iterations=1000,
        learning_rate=1,
        depth=2,
        loss_function='MultiClass',
        eval_metric='Accuracy',
    )
    classifier.fit(x_train, y_train)
    pred = classifier.predict(x_test)

    report = [
        accuracy_score(y_test, pred, normalize=True),
        f1_score(y_test, pred, average=None),
        cohen_kappa_score(y_test, pred),
        confusion_matrix(y_test, pred),
    ]

    print('Accuracy: ', report[0])
    print('F1 score: ', report[1])
    print(report[3])

    return report + [y_test, pred]
def catboost_model(X_train, X_test, y_train, y_test, catboost_params=None, verbose=100, plot=False):
    """Train a CatBoost classifier on the learn split, evaluating on the test split.

    The categorical and text feature lists come from the module-level names
    ``cat_features`` and ``text_features``.

    :param X_train: training features; ``list(X_train)`` supplies the feature names.
    :param X_test: evaluation features.
    :param y_train: training labels.
    :param y_test: evaluation labels.
    :param catboost_params: optional dict that overrides or extends the defaults
        (1000 iterations, 'Accuracy' eval metric, GPU task type).
    :param verbose: forwarded to ``fit``.
    :param plot: forwarded to ``fit``.
    :return: the fitted ``CatBoostClassifier``.
    """
    feature_names = list(X_train)

    learn_pool = Pool(
        X_train,
        y_train,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=feature_names,
    )
    test_pool = Pool(
        X_test,
        y_test,
        cat_features=cat_features,
        text_features=text_features,
        feature_names=feature_names,
    )

    params = {
        'iterations': 1000,
        'eval_metric': 'Accuracy',
        'task_type': 'GPU',
    }
    # ``None`` replaces the former mutable ``{}`` default; passing a dict
    # behaves exactly as before.
    params.update(catboost_params or {})

    model = CatBoostClassifier(**params)
    # Model training. The former extra ``model.predict(X_test)`` call was
    # dropped: its result was never used.
    model.fit(learn_pool, eval_set=test_pool, verbose=verbose, plot=plot)
    return model
def train_gbm(train_data, train_labels, val_data, val_labels, test_data, test_labels, random_state=42):
    """Train a GPU CatBoost Logloss classifier, save it, and report test metrics.

    The model is fitted with the validation data as eval set, saved as
    'catboost_1' in cbm format, and evaluated on the test data: ROC AUC and
    F1 are printed, and a precision-recall curve is plotted with the average
    precision in its title.

    :return: the fitted ``CatBoostClassifier``.
    """
    gbm = CatBoostClassifier(
        task_type="GPU",
        logging_level='Silent',
        loss_function='Logloss',
        od_type='Iter',
        od_wait=20,
        random_state=random_state,
    )
    eval_pool = Pool(val_data, val_labels)
    gbm.fit(train_data, train_labels, eval_set=eval_pool, use_best_model=True)
    gbm.save_model('catboost_1', format="cbm", export_parameters=None, pool=None)

    pred_probs = gbm.predict_proba(test_data)[:, 1]
    pred_labels = gbm.predict(test_data)

    score = [
        roc_auc_score(test_labels, pred_probs),
        f1_score(test_labels, pred_labels),
    ]
    print('roc_auc: ', score[0])
    print('f1: ', score[1])

    # average_precision_score takes (y_true, y_score). The earlier call passed
    # the predicted labels as y_true and the true labels as the score.
    average_precision = average_precision_score(test_labels, pred_probs)

    disp = plot_precision_recall_curve(gbm, test_data, test_labels)
    # The title previously ended after the colon and the computed value was
    # never shown.
    disp.ax_.set_title('2-class Precision-Recall curve: AP={0:0.2f}'.format(average_precision))
    return gbm
def example_gpu():
    """Minimal GPU example: fit on four 2-feature rows and print predictions for four others."""
    from catboost import CatBoostClassifier

    train_data = [
        [0, 3],
        [4, 1],
        [8, 1],
        [9, 1],
    ]
    train_labels = [0, 0, 1, 1]
    eval_data = [
        [2, 4],
        [1, 4],
        [20, 5],
        [10, 1],
    ]

    classifier = CatBoostClassifier(iterations=1000, task_type="GPU", devices='0:1')
    classifier.fit(train_data, train_labels, verbose=False)

    # Predict on the evaluation rows and show the result.
    print(classifier.predict(eval_data))
def train_meta(train_x, train_y, kfold):
    """Fit one CatBoost classifier per fold and return the models with their validation accuracies.

    :param train_x: feature frame, indexed positionally via ``.iloc``.
    :param train_y: target, indexed positionally via ``.iloc``.
    :param kfold: splitter whose ``split(train_x, train_y)`` yields index pairs.
    :return: ``(models, acc_results)``, both in fold order.
    """
    models, acc_results = [], []

    for learn_idx, check_idx in kfold.split(train_x, train_y):
        # Each part is re-indexed from zero after the positional selection.
        learn_x = train_x.iloc[learn_idx].reset_index(drop=True)
        learn_y = train_y.iloc[learn_idx].reset_index(drop=True)
        check_x = train_x.iloc[check_idx].reset_index(drop=True)
        check_y = train_y.iloc[check_idx].reset_index(drop=True)

        fold_model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
            verbose=20,
        )
        fold_model.fit(learn_x, learn_y, eval_set=(check_x, check_y))

        fold_accuracy = accuracy_score(check_y, fold_model.predict(check_x))

        models.append(fold_model)
        acc_results.append(fold_accuracy)

    return models, acc_results
def objective(X, y, trial):
    """Objective function to optimise: mean 5-fold accuracy after a PCA projection.

    The number of PCA components is suggested by ``trial`` between 1 and the
    number of columns of ``X``. PCA is fitted on all of ``X`` before the
    folds are drawn.

    :return: mean of the per-fold validation accuracies.
    """
    component_count = trial.suggest_int("n_components", 1, len(list(X.columns)))

    projector = PCA(n_components=component_count).fit(X)
    x_pca = pd.DataFrame(projector.transform(X))
    print(x_pca, y)

    def _rows(frame, positions):
        # Positional selection with the index reset to 0..n-1.
        return frame.iloc[positions].reset_index(drop=True)

    fold_scores = []
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
    for fit_positions, holdout_positions in splitter.split(x_pca, y):
        fit_x = _rows(x_pca, fit_positions)
        fit_y = _rows(y, fit_positions)
        holdout_x = _rows(x_pca, holdout_positions)
        holdout_y = _rows(y, holdout_positions)

        classifier = CatBoostClassifier(
            iterations=500,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
            verbose=20,
        )
        classifier.fit(fit_x, fit_y, eval_set=(holdout_x, holdout_y))

        holdout_pred = classifier.predict(holdout_x)
        fold_scores.append(accuracy_score(holdout_y, holdout_pred))

    # Mean accuracy over the folds.
    return sum(fold_scores) / len(fold_scores)
def train(train_x, train_y, kfold, best_params=None, algorithm_name=None):
    """Fit one CatBoost classifier per fold, optionally dumping each model to disk.

    When ``algorithm_name`` is given, every fold's model is written with
    ``joblib`` under ``DATA_DIR``. ``best_params`` is accepted but not read.

    :return: ``(models, acc_results)``, both in fold order.
    """
    models = []
    acc_results = []
    should_persist = algorithm_name is not None

    for fold_no, (fit_idx, check_idx) in enumerate(kfold.split(train_x, train_y)):
        fit_x = train_x.iloc[fit_idx].reset_index(drop=True)
        fit_y = train_y.iloc[fit_idx].reset_index(drop=True)
        check_x = train_x.iloc[check_idx].reset_index(drop=True)
        check_y = train_y.iloc[check_idx].reset_index(drop=True)

        estimator = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.1,
            use_best_model=True,
            eval_metric="Accuracy",
            verbose=20,
        )
        estimator.fit(fit_x, fit_y, eval_set=(check_x, check_y), plot=True)

        check_pred = estimator.predict(check_x)
        fold_accuracy = accuracy_score(check_y, check_pred)

        if should_persist:
            joblib.dump(estimator, f"{DATA_DIR}/{algorithm_name}_model_{fold_no}.pkl")

        models.append(estimator)
        acc_results.append(fold_accuracy)

    return models, acc_results
def tdetect2(no, clf):
    """Train a CatBoost classifier for customer ``no`` and report sensitivity and false-positive rate.

    The data comes from ``ccnc2(no)``; 14% of it is held out. Both the
    training part and the held-out part are resampled with SMOTE before
    fitting and scoring.

    NOTE(review): the ``clf`` argument is ignored. A fresh silent
    ``CatBoostClassifier`` is always used, as before; the parameter is kept
    so existing callers keep working.

    :param no: index into ``c_no`` and argument of ``ccnc2``.
    :param clf: unused (see note above).
    :return: ``(sensitivity, fpr)``.
    """
    customer_meter = c_no[no]
    X, y = ccnc2(no)

    clf = CatBoostClassifier(logging_level="Silent")

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.14, random_state=0)

    sm = SMOTE(random_state=42)
    X_res_train, y_res_train = sm.fit_sample(X_train, y_train)
    X_res_test, y_res_test = sm.fit_sample(X_test, y_test)

    clf.fit(X_res_train, y_res_train)
    # The earlier clf.score(...) and clf.predict_proba(...) calls were removed:
    # their results were never read and each cost another pass over the data.
    y_pred = clf.predict(X_res_test)

    tn, fp, fn, tp = confusion_matrix(y_res_test, y_pred).ravel()
    specificity = tn / (tn + fp)
    sensitivity = tp / (tp + fn)
    fpr = 1 - specificity

    print("sensi = %.2f" % sensitivity, "fpr= %.2f" % fpr)
    print("The score for customer :", customer_meter, " is %.2f" % sensitivity)
    return sensitivity, fpr
def GradientBoost(X_train, X_test, y_train):
    """Fit a 10-iteration, depth-5 CatBoost classifier and return its predictions for ``X_test``."""
    booster = CatBoostClassifier(iterations=10, depth=5)
    booster.fit(X_train, y_train)
    return booster.predict(X_test)
def gbm_predict(data):
    """Load the model stored at './models/gbm1.cbm' and return its predictions for ``data``."""
    classifier = CatBoostClassifier()
    classifier.load_model('./models/gbm1.cbm')
    return classifier.predict(data)
def score_model(train, test, b_cases, drivers):
    """Train, save and score one CatBoost classifier per target column.

    The targets 'Authentication' and 'None' are skipped. For every other
    target the feature columns listed in ``drivers[target]`` are
    standardised, a classifier is fitted on ``train`` and saved under
    ``cur_path/modelling/models``, and accuracy is computed on the rows of
    ``test`` whose target is not NaN.

    :param train: training frame with feature and target columns.
    :param test: evaluation frame with the same columns.
    :param b_cases: iterable of target column names.
    :param drivers: mapping from target column to its feature columns.
    :return: dict mapping target column to accuracy.
    """
    pred_scores = {}
    for target_col in list(b_cases):
        if target_col in ['Authentication', 'None']:
            continue

        feature_cols = drivers[target_col]

        # Fit the scaler on the training data only and apply the same
        # transformation to the test data. Fitting a second scaler on the
        # test split would scale the two sets with different statistics.
        scaler = StandardScaler()
        train_x = scaler.fit_transform(train[feature_cols])
        test_x = scaler.transform(test[feature_cols])
        train_y = train[target_col]
        test_y = test[target_col]

        predictor = CatBoostClassifier()
        predictor.fit(train_x, train_y)
        predictions = predictor.predict(test_x)

        model_file = '%s_classifier.mod' % (target_col.replace('/', '_'))
        predictor.save_model(os.path.join(cur_path, 'modelling', 'models', model_file))

        # ``j == j`` is False only for NaN: keep predictions whose true
        # label is present, matching ``test_y.dropna()`` below.
        predictions = [i for i, j in zip(predictions, test_y.values) if j == j]
        pred_scores[target_col] = accuracy_score(test_y.dropna(), predictions)

    return pred_scores
def cross_val(X, y, X_test, param, cat_features, n_splits=3):
    """Return the mean validation accuracy of a CatBoost classifier over stratified folds.

    ``X_test`` is accepted but not read by this function.

    :param X: feature frame, indexed positionally via ``.iloc``.
    :param y: target, indexed positionally via ``.iloc``.
    :param param: dict providing 'loss_function', 'depth' and 'l2_leaf_reg'.
    :param cat_features: forwarded to ``fit``.
    :param n_splits: number of folds and divisor of the accuracy sum.
    :return: sum of the fold accuracies divided by ``n_splits``.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    fold_accuracies = []

    for fit_idx, check_idx in splitter.split(X, y):
        fit_x, fit_y = X.iloc[fit_idx], y.iloc[fit_idx]
        check_x, check_y = X.iloc[check_idx], y.iloc[check_idx]

        estimator = CatBoostClassifier(
            iterations=500,
            loss_function=param['loss_function'],
            depth=param['depth'],
            l2_leaf_reg=param['l2_leaf_reg'],
            eval_metric='Accuracy',
            leaf_estimation_iterations=10,
            use_best_model=True,
            logging_level='Silent',
        )
        estimator.fit(fit_x, fit_y, cat_features=cat_features, eval_set=(check_x, check_y))

        check_pred = estimator.predict(check_x)
        fold_accuracies.append(accuracy_score(check_y, check_pred))

    return sum(fold_accuracies) / n_splits
def train():
    """Run 10000 episodes in ``MyEnv`` using a saved CatBoost model as the policy.

    For every state the model's class probabilities are computed. If the
    module-level ``deterministic`` flag is truthy, the arg-max class is
    translated with ``action_mapping``; otherwise an action is sampled from
    ``[0, 1, 3, 4, 5]`` with those probabilities. The accumulated reward is
    printed and reset every ``print_interval`` episodes (episode 0 excluded).
    """
    env = my_env.MyEnv(0, realtime_mode=True)
    # try/finally guarantees the environment is closed even if loading the
    # model or stepping the environment raises.
    try:
        model = CatBoostClassifier()
        model.load_model("catboost_model.model")

        score = 0.0
        print_interval = 1

        for n_epi in range(10000):
            s = env.reset()
            done = False

            # The loop condition alone ends the episode; no explicit break needed.
            while not done:
                y_pred1 = model.predict(s, prediction_type="Probability")
                if deterministic:
                    y_pred_max = int(np.argmax(y_pred1))
                    a = action_mapping(y_pred_max)
                else:
                    a = int(np.random.choice([0, 1, 3, 4, 5], p=y_pred1))

                s, r, done, info = env.step(a)
                score += r

            if n_epi % print_interval == 0 and n_epi != 0:
                print("# of episode :{}, avg score : {:.5f}".format(n_epi, score/print_interval))
                score = 0.0
    finally:
        env.close()
def get_roc_auc_score(self, generated, real, weights=None):
    """Train a classifier to tell generated from real samples and return its weighted ROC AUC.

    Generated rows are labelled 0 and real rows 1. The data is split 80/20
    (stratified, seeded with ``self.params["seed"]``) and the score is
    computed on the test part with ``calculate_roc_auc``.

    :param generated: array of generated samples (first axis = samples).
    :param real: array of real samples (first axis = samples).
    :param weights: optional per-sample weights; the same array is applied to
        the generated and to the real block. ``None`` means unit weights.
    :return: value returned by ``calculate_roc_auc``.

    NOTE(review): the score is computed from ``classifier.predict`` output,
    not from probabilities, and the training weights are not passed to
    ``fit``. Both are kept as they were.
    """
    X = np.concatenate((generated, real))
    y = np.array([0] * generated.shape[0] + [1] * real.shape[0])

    if weights is None:
        # The default used to crash inside np.concatenate; fall back to
        # one unit weight per row.
        weights = np.ones(X.shape[0])
    else:
        weights = np.concatenate((weights, weights))

    (
        X_train, X_test,
        y_train, y_test,
        w_train, w_test
    ) = train_test_split(
        X, y, weights,
        test_size=0.2,
        random_state=self.params["seed"],
        stratify=y,
        shuffle=True,
    )

    classifier = CatBoostClassifier(iterations=1000, thread_count=10, silent=True)
    classifier.fit(X_train, y_train)
    predicted = classifier.predict(X_test)

    roc_auc = calculate_roc_auc(y_test, predicted, w_test)
    return roc_auc
def cross_val(X, y, param, cat_features, n_splits=3):
    """Return the mean validation ``auc_score`` of a CatBoost classifier over stratified folds.

    :param X: features, indexed directly with the fold index arrays.
    :param y: target, indexed directly with the fold index arrays.
    :param param: dict providing 'loss_function', 'depth' and 'l2_leaf_reg'.
    :param cat_features: forwarded to ``fit``.
    :param n_splits: number of folds and divisor of the score sum.
    :return: sum of the fold scores divided by ``n_splits``.
    """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)
    scores = []

    for tr_ind, val_ind in skf.split(X, y):
        X_train, y_train = X[tr_ind], y[tr_ind]
        X_valid, y_valid = X[val_ind], y[val_ind]

        # ``n_estimators`` is an alias of ``iterations``. Passing both, even
        # with the same value, is rejected by CatBoost, so only one is set.
        clf = CatBoostClassifier(
            iterations=500,
            loss_function=param['loss_function'],
            depth=param['depth'],
            l2_leaf_reg=param['l2_leaf_reg'],
            eval_metric='Logloss',
            leaf_estimation_iterations=10,
            use_best_model=True,
            logging_level='Silent',
            thread_count=5,
        )
        clf.fit(X_train, y_train, cat_features=cat_features, eval_set=(X_valid, y_valid))

        y_pred = clf.predict(X_valid)
        scores.append(auc_score(y_valid, y_pred))

    return sum(scores) / n_splits
def using_best_param(train, test, label):
    """
    Train the model with the best parameters and label the test rows.

    Every column of ``train`` is declared categorical. Predictions of at
    least 0.49 become 1, all others 0.

    :param train: training features.
    :param test: features to predict.
    :param label: training target.
    :return: DataFrame with columns 'sid' (row number) and 'label'.
    """
    def _binarise(value):
        # Same 0.49 cut-off as before.
        if value >= 0.49:
            return 1
        return 0

    classifier = CatBoostClassifier(
        iterations=1000,
        learning_rate=0.1,
        max_depth=7,
        cat_features=train.columns,
        verbose=100,
        custom_metric='F1',
        random_seed=2019,
        early_stopping_rounds=200,
        task_type='CPU',
        thread_count=11,
        eval_metric='F1',
    )
    classifier.fit(train, label)
    predicted = classifier.predict(test).tolist()

    judge_df = pd.DataFrame()
    judge_df['sid'] = range(test.shape[0])
    judge_df['label'] = predicted
    judge_df['label'] = judge_df['label'].apply(_binarise)
    return judge_df[['sid', 'label']]
def param_model_training(self, learning_rate: float, depth: int, trees: int) -> tuple:
    """
    Fit a CatBoost classifier with the given hyper-parameters on an 80/20 split
    of ``self.X_train`` / ``self.y_train``.

    Returns:
        (model, validation predictions, validation probabilities, validation labels)
    """
    X_fit, X_val, y_fit, y_val = train_test_split(
        self.X_train, self.y_train, test_size=0.2, random_state=0
    )

    classifier = CatBoostClassifier(
        iterations=trees,
        learning_rate=learning_rate,
        depth=depth,
    )
    classifier.fit(
        X_fit,
        y_fit,
        cat_features=self.category_features,
        eval_set=(X_val, y_val),
        verbose=False,
    )

    val_labels = classifier.predict(data=X_val)
    val_probs = classifier.predict_proba(data=X_val)
    return classifier, val_labels, val_probs, y_val
def train(train_x, train_y, kfold, best_params=None):
    """Cross-validated CatBoost training: one model and one accuracy per fold.

    ``best_params`` is accepted but not read by this function.

    :param train_x: feature frame, indexed positionally via ``.iloc``.
    :param train_y: target, indexed positionally via ``.iloc``.
    :param kfold: splitter whose ``split(train_x, train_y)`` yields index pairs.
    :return: ``(models, acc_results)``, both in fold order.
    """
    classifier_settings = dict(
        iterations=1000,
        learning_rate=0.1,
        use_best_model=True,
        eval_metric="Accuracy",
    )

    models, acc_results = [], []
    for learn_idx, check_idx in kfold.split(train_x, train_y):
        learn_x = train_x.iloc[learn_idx].reset_index(drop=True)
        learn_y = train_y.iloc[learn_idx].reset_index(drop=True)
        check_x = train_x.iloc[check_idx].reset_index(drop=True)
        check_y = train_y.iloc[check_idx].reset_index(drop=True)

        fold_model = CatBoostClassifier(**classifier_settings)
        fold_model.fit(
            learn_x,
            learn_y,
            eval_set=(check_x, check_y),
            plot=True,
        )

        check_pred = fold_model.predict(check_x)
        fold_accuracy = accuracy_score(check_y, check_pred)

        models.append(fold_model)
        acc_results.append(fold_accuracy)

    return models, acc_results
def test_predict_class():
    """Save 'Class' predictions for the test pool and return them as a canonical file."""
    learn = Pool(TRAIN_FILE, column_description=CD_FILE)
    holdout = Pool(TEST_FILE, column_description=CD_FILE)

    classifier = CatBoostClassifier(iterations=2, random_seed=0)
    classifier.fit(learn)

    class_pred = classifier.predict(holdout, prediction_type="Class")
    np.save(PREDS_PATH, np.array(class_pred))
    return local_canonical_file(PREDS_PATH)
def test_titanic():
    """Smoke test: fit and predict on the titanic training frame with NaNs replaced by -999."""
    train_df = titanic()[0].fillna(-999)
    X, y = train_df.drop('Survived', axis=1), train_df.Survived

    # Every non-float column is treated as categorical. ``np.float`` was only
    # an alias of the builtin ``float`` and no longer exists in current
    # NumPy; the comparison is unchanged.
    categorical_features_indices = np.where(X.dtypes != float)[0]

    model = CatBoostClassifier(iterations=5)
    model.fit(X, y, cat_features=categorical_features_indices)

    # Only checks that prediction runs; the result is not inspected.
    preds = model.predict(X)
def test_custom_objective():
    """A hand-written Logloss objective must match the built-in ``Logloss`` raw predictions."""

    class LoglossObjective(object):
        """Custom objective that supplies Logloss derivatives."""

        def calc_ders_range(self, approxes, targets, weights):
            """Return a list of ``(der1, der2)`` tuples, one per object."""
            assert len(approxes) == len(targets)
            if weights is not None:
                assert len(weights) == len(approxes)

            # ``range`` instead of ``xrange`` so the test also runs on Python 3.
            # Access stays index-based: only len() and indexing are used on
            # the containers passed in.
            exponents = [math.exp(approxes[index]) for index in range(len(approxes))]

            result = []
            for index in range(len(targets)):
                p = exponents[index] / (1 + exponents[index])
                der1 = (1 - p) if targets[index] > 0.0 else -p
                der2 = -p * (1 - p)

                if weights is not None:
                    der1 *= weights[index]
                    der2 *= weights[index]

                result.append((der1, der2))

            return result

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(
        iterations=5,
        random_seed=0,
        use_best_model=True,
        loss_function=LoglossObjective(),
        eval_metric="Logloss",
        # Leaf estimation method and gradient iteration are set to match
        # defaults for Logloss.
        leaf_estimation_method="Newton",
        leaf_estimation_iterations=10,
    )
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool, prediction_type='RawFormulaVal')

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, loss_function="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool, prediction_type='RawFormulaVal')

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def test_custom_eval():
    """A hand-written metric object must lead to the same predictions as ``eval_metric="Logloss"``."""

    class LoglossMetric(object):
        """Custom eval metric: weighted log-likelihood sum, to be maximised."""

        def get_final_error(self, error, weight):
            """Normalise the accumulated error by the accumulated weight."""
            # The tiny constant keeps the division defined for zero weight.
            return error / (weight + 1e-38)

        def is_max_optimal(self):
            """Larger values of this metric are better."""
            return True

        def evaluate(self, approxes, target, weight):
            """Return ``(error_sum, weight_sum)`` for a single approx dimension."""
            assert len(approxes) == 1
            assert len(target) == len(approxes[0])

            approx = approxes[0]

            error_sum = 0.0
            weight_sum = 0.0

            # ``range`` instead of ``xrange`` so the test also runs on Python 3.
            for i in range(len(approx)):
                w = 1.0 if weight is None else weight[i]
                weight_sum += w
                error_sum += w * (target[i] * approx[i] - math.log(1 + math.exp(approx[i])))

            return error_sum, weight_sum

    train_pool = Pool(data=TRAIN_FILE, column_description=CD_FILE)
    test_pool = Pool(data=TEST_FILE, column_description=CD_FILE)

    model = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric=LoglossMetric())
    model.fit(train_pool, eval_set=test_pool)
    pred1 = model.predict(test_pool)

    model2 = CatBoostClassifier(iterations=5, random_seed=0, use_best_model=True, eval_metric="Logloss")
    model2.fit(train_pool, eval_set=test_pool)
    pred2 = model2.predict(test_pool)

    for p1, p2 in zip(pred1, pred2):
        assert abs(p1 - p2) < EPS
def test_predict_without_fit():
    """Calling ``predict`` on a classifier that was never fitted must raise ``CatboostError``."""
    pool = Pool(TRAIN_FILE, column_description=CD_FILE)
    model = CatBoostClassifier()
    # Only the predict call sits inside the context manager, so an error
    # while building the pool cannot make the test pass by accident.
    with pytest.raises(CatboostError):
        model.predict(pool)