def objective(trial):
    """Optuna objective: fit a balanced random forest, return validation AUC.

    A fresh 80/20 split of ``self.X`` / ``self.y`` is drawn on every call
    (``self`` comes from the enclosing scope).  Missing values are replaced
    by per-column medians learned on the training fold only; the same fitted
    imputer is then applied to the held-out fold and to
    ``self.X_validation`` so that no statistics leak from evaluation data
    and all three sets are imputed consistently.

    Side effects: prints the accuracy and AUC obtained on
    ``self.X_validation`` and shows its confusion matrix.

    Args:
        trial: Optuna trial used to sample the forest hyper-parameters.

    Returns:
        ROC AUC of the fitted model on the held-out 20% fold.
    """
    train_X, val_X, train_y, val_y = train_test_split(
        self.X, self.y, test_size=0.2)

    # ``np.nan`` instead of the ``np.NaN`` alias, which NumPy 2.0 removed.
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    # Fit on the training fold only, then reuse the learned medians.
    v_train_X = median_imputer.fit_transform(train_X)
    v_val_X = median_imputer.transform(val_X)
    v_test_X = median_imputer.transform(self.X_validation)

    train_X = pd.DataFrame(v_train_X, columns=train_X.columns,
                           index=train_X.index)
    val_X = pd.DataFrame(v_val_X, columns=val_X.columns, index=val_X.index)
    test_X = pd.DataFrame(v_test_X, columns=self.X_validation.columns,
                          index=self.X_validation.index)

    # Hyper-parameters are sampled in a fixed order so that seeded studies
    # stay reproducible.
    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    brf_n_estimators = trial.suggest_categorical('n_estimators', list_trees)
    brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
    brf_min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
    brf_min_weight_fraction_leaf = trial.suggest_uniform(
        'min_weight_fraction_leaf', 0, 0.5)
    brf_max_depth = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(
        n_estimators=brf_n_estimators,
        max_features=brf_max_features,
        min_samples_split=brf_min_samples_split,
        min_samples_leaf=brf_min_samples_leaf,
        max_depth=brf_max_depth,
        min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
        bootstrap=True)
    brfmodel.fit(train_X, train_y)

    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    aucbrf_test = roc_auc_score(self.y_validation,
                                brfmodel.predict_proba(test_X)[:, 1])

    # Diagnostics on the external validation set; not part of the objective.
    print('Accuracy test ' + str(
        accuracy_score(self.y_validation, brfmodel.predict(test_X))))
    plt.figure()
    plot_confusion_matrix(brfmodel, test_X, self.y_validation,
                          cmap=plt.cm.Blues, normalize=None)
    plt.show()
    print(aucbrf_test)
    return aucbrf
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Each stored pipeline must behave like its sampler followed by its tree."""
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0
    )
    brf.fit(X, y)

    for idx in range(n_estimators):
        sampler = brf.samplers_[idx]
        pipeline = brf.pipelines_[idx]

        # The sampler stored on its own and the one inside the pipeline
        # must resample identically.
        X_res, y_res = sampler.fit_resample(X, y)
        pipeline_sampler = pipeline.named_steps["randomundersampler"]
        X_res_2, y_res_2 = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        # Hard predictions: tree fitted on resampled data vs. full pipeline.
        labels_tree = brf.estimators_[idx].fit(X_res, y_res).predict(X)
        labels_pipeline = brf.pipelines_[idx].fit(X, y).predict(X)
        assert_array_equal(labels_tree, labels_pipeline)

        # Same comparison on class probabilities.
        proba_tree = brf.estimators_[idx].fit(X_res, y_res).predict_proba(X)
        proba_pipeline = brf.pipelines_[idx].fit(X, y).predict_proba(X)
        assert_array_equal(proba_tree, proba_pipeline)
def test_little_tree_with_small_max_samples():
    """A tree grown with ``max_samples=2`` must be smaller than an unrestricted one."""
    rng = np.random.RandomState(1)
    n_rows = 10000
    X = rng.randn(n_rows, 2)
    y = rng.randn(n_rows) > 0

    # Both forests share the same RandomState instance, so the fit order
    # below (unrestricted first) is part of the test.
    shared_kwargs = dict(n_estimators=1, random_state=rng)
    unrestricted = BalancedRandomForestClassifier(max_samples=None, **shared_kwargs)
    restricted = BalancedRandomForestClassifier(max_samples=2, **shared_kwargs)

    unrestricted.fit(X, y)
    restricted.fit(X, y)

    nodes_unrestricted = unrestricted.estimators_[0].tree_.node_count
    nodes_restricted = restricted.estimators_[0].tree_.node_count

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert nodes_unrestricted > nodes_restricted, msg
def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
    """Fit a default balanced random forest on the ``has_claim`` target.

    The frame is split through ``cls.get_X_Y_split``; only the training
    portion is used, the test portion is discarded.

    Args:
        preprocessed_df: Frame handed to ``cls.get_X_Y_split``.

    Returns:
        The fitted ``BalancedRandomForestClassifier``.
    """
    X_train, _X_test, Y_train, _Y_test = cls.get_X_Y_split(
        preprocessed_df, "has_claim"
    )
    classifier = BalancedRandomForestClassifier()
    classifier.fit(X_train, Y_train)
    return classifier
def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the OOB score tracks the hold-out score and the low-estimator warning.

    First part: with many trees, the out-of-bag score must be within 0.1 of
    the score on a stratified hold-out split.
    Second part: with a single tree, fitting must emit a ``UserWarning``.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, stratify=y
    )
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(
        oob_score=True, random_state=0, n_estimators=1, bootstrap=True
    )
    # The two context managers must be separated by a comma: ``a and b``
    # evaluates to a single object, so only one of them would be entered.
    with pytest.warns(UserWarning), np.errstate(divide="ignore", invalid="ignore"):
        est.fit(X, y)
def main():
    """Run 10-fold training for two forests and dump their outputs as JSON.

    Steps: split the challenges, load the X / y chunks 1..162 indexed by
    ``('l0', 'l1')``, then for each of the 10 folds train

    * a ``BalancedRandomForestClassifier`` (results under ``brf``), and
    * a ``RandomForestClassifier`` on data under-sampled once with
      ``RandomUnderSampler`` (results under ``rus``),

    writing per-fold class probabilities and feature importances below
    ``RESULT_PATH``.
    """
    print('Spliting challenges')
    split_challenges()

    chunk_ids = range(1, 163)
    index_cols = ['l0', 'l1']

    print('Reading X...')
    x_chunks = [pd.read_json(XY_PATH['X'].format(k), orient='records')
                for k in chunk_ids]
    X = pd.concat(x_chunks).set_index(index_cols)

    print('Reading y...')
    y_chunks = [pd.read_json(XY_PATH['y'].format(k), orient='records')
                for k in chunk_ids]
    y = pd.concat(y_chunks).set_index(index_cols)

    print('\nTraining Inner sampler RFC')
    brf_dir = os.path.join(RESULT_PATH, 'brf')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        balanced_rfc = BalancedRandomForestClassifier(n_estimators=100,
                                                      random_state=0)
        balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        probabilities = pd.DataFrame(
            balanced_rfc.predict_proba(X_test.to_numpy()), index=y_test.index)
        probabilities.reset_index().to_json(
            os.path.join(brf_dir, f'y_prob_{fold}.json'), orient='records')

        importances = pd.Series(balanced_rfc.feature_importances_)
        importances.to_json(
            os.path.join(brf_dir, f'feature_importance_{fold}.json'))

    print('\nTraining RandomUnderSampler')
    rus_dir = os.path.join(RESULT_PATH, 'rus')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        rfc = RandomForestClassifier(n_estimators=100, random_state=0)
        rus = RandomUnderSampler(random_state=0)
        # Under-sample once up front, then train a plain forest on it.
        X_resample, y_resample = rus.fit_resample(
            X_train.to_numpy(), y_train.to_numpy().ravel())
        rfc.fit(X_resample, y_resample)

        probabilities = pd.DataFrame(
            rfc.predict_proba(X_test.to_numpy()), index=y_test.index)
        probabilities.reset_index().to_json(
            os.path.join(rus_dir, f'y_prob_{fold}.json'), orient='records')

        importances = pd.Series(rfc.feature_importances_)
        importances.to_json(
            os.path.join(rus_dir, f'feature_importance_{fold}.json'))
def random_forest(df, drop, target, show, model_name):
    """Train a balanced random forest and return its balanced accuracy in percent.

    Args:
        df: Frame holding both the feature columns and the target column.
        drop: Column names to exclude from the feature set.
        target: Name of the outcome column.
        show: When truthy, print the feature importances, highest first.
        model_name: Label used in the printed feature-importance header.

    Returns:
        Balanced accuracy on the held-out split, multiplied by 100.
    """
    # Split the table into features and outcomes.
    x_cols = [col for col in df.columns if col not in drop]
    X = df[x_cols]
    y = df[target]

    # Split features and outcomes into train and test data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(X_train, y_train)
    y_predictions = brf.predict(X_test)

    # Balanced accuracy (mean per-class recall), not plain accuracy.
    acc_score = balanced_accuracy_score(y_test, y_predictions)

    if show:
        # Ranking is only needed for display, so build it here.
        feature_importance = sorted(
            zip(brf.feature_importances_, X.columns.tolist()), reverse=True)
        print(f"Feature Importance: {model_name}")
        for entry in feature_importance:
            print(entry)
        print("\n")

    return acc_score * 100
def predict_model_kfold(name, path, features_type, label_name, data):
    """Evaluate a balanced random forest with shuffled 10-fold CV, then refit on all data.

    The per-fold results returned by ``create_all_eval_results`` are
    averaged and appended as one row to the module-level
    ``results_all_projects`` frame.

    Args:
        name: Project name recorded in the results row.
        path: Unused; kept for interface compatibility.
        features_type: Feature-set label recorded in the results row.
        label_name: Unused; the target column is always ``'hasBug'``.
        data: Frame containing the features and a ``'hasBug'`` column.

    Returns:
        The classifier fitted on the complete data set.
    """
    # Keyword arguments: recent scikit-learn rejects a positional ``shuffle``.
    kfold = KFold(n_splits=10, shuffle=True)
    # RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000, max_depth=5)

    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)

    all_predictions = 0
    n_folds = 0
    for train, test in kfold.split(data):
        n_folds += 1
        prediction_train = model.fit(
            x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(
            False, y.iloc[test], prediction_train, name, "training",
            features_type, num_of_bugs, num_of_all_instances, bug_precent,
            None)
    # Average the accumulated per-fold metrics.
    all_predictions = all_predictions / n_folds

    start_list = [name, "training", features_type, "sklearn - python"]
    result_list = start_list + all_predictions.tolist()
    global results_all_projects
    results_all_projects.loc[len(results_all_projects)] = result_list

    # Final model uses every available row.
    model.fit(x, y)
    return model
def test_balanced_random_forest(imbalanced_dataset):
    """Fitted forest exposes one sampler, tree and pipeline per estimator."""
    n_estimators = 10
    n_features = imbalanced_dataset[0].shape[1]

    brf = BalancedRandomForestClassifier(n_estimators=n_estimators,
                                         random_state=0)
    brf.fit(*imbalanced_dataset)

    # Every per-estimator collection must have exactly n_estimators entries.
    for fitted_collection in (brf.samplers_, brf.estimators_, brf.pipelines_):
        assert len(fitted_collection) == n_estimators
    # One importance value per input feature.
    assert len(brf.feature_importances_) == n_features
def test_balanced_random_forest_pruning(imbalanced_dataset):
    """Cost-complexity pruning (``ccp_alpha``) must shrink the first tree."""

    def first_tree_node_count(forest):
        # Size of the first fitted tree of the given forest.
        return forest.estimators_[0].tree_.node_count

    unpruned = BalancedRandomForestClassifier()
    unpruned.fit(*imbalanced_dataset)
    n_nodes_no_pruning = first_tree_node_count(unpruned)

    pruned = BalancedRandomForestClassifier(ccp_alpha=0.015)
    pruned.fit(*imbalanced_dataset)
    n_nodes_pruning = first_tree_node_count(pruned)

    assert n_nodes_no_pruning > n_nodes_pruning
def evaluate_model(self):
    """Estimate the AUC of the stored hyper-parameters with repeated stratified CV.

    Hyper-parameters are read from ``param_RF_<epoch>.json`` in
    ``self.result_folder``.  Twenty differently-seeded 5-fold splits are
    evaluated; in every fold the median imputer is fitted on the training
    part only and then applied to both parts.

    Returns:
        List with one ROC AUC value per evaluated fold.
    """
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)

    # NOTE(review): if the file holds several entries, the model built from
    # the last one is the one evaluated - confirm this is intended.
    for data in dati:
        del data['value']  # 'value' is not passed to the classifier
        rf_model = BalancedRandomForestClassifier(**data)

    rf_auc = []
    for i in tqdm(range(20)):
        cv = StratifiedKFold(n_splits=5, shuffle=True,
                             random_state=i + 187462)
        for train_index, test_index in cv.split(self.X, self.y):
            trainX = self.X.iloc[train_index]
            testX = self.X.iloc[test_index]
            trainy = np.take(self.y, train_index)
            testy = np.take(self.y, test_index)

            # Learn the medians on the training fold only and reuse them for
            # the test fold (``np.nan``: the ``np.NaN`` alias is gone in
            # NumPy 2.0).
            imputer = SimpleImputer(missing_values=np.nan,
                                    strategy='median').fit(trainX)
            trainX = pd.DataFrame(imputer.transform(trainX),
                                  columns=trainX.columns, index=trainX.index)
            testX = pd.DataFrame(imputer.transform(testX),
                                 columns=testX.columns, index=testX.index)

            # AUC of the configured model on this fold.
            rf_model.fit(trainX, trainy)
            roc_rf = roc_auc_score(
                testy, rf_model.predict_proba(testX)[:, 1])
            rf_auc.append(roc_rf)
            print(roc_rf)

    print(statistics.mean(rf_auc))
    return rf_auc
def _plot_championship_importance(all_res, save_directory, top = 6):
    """Box-plot feature importances for predicting champions and save the figure.

    Nothing is done when the output file already exists.  For each team and
    season, the ``off_norm`` / ``def_norm`` values of the ``top`` rows with
    the largest ``TIME`` form the feature vector; the label is 1 when the
    team is among that season's champions.  For several forest sizes, one
    balanced random forest is fitted per positive sample with that sample
    left out, and all resulting importances are collected.

    Args:
        all_res: Mapping season -> sequence whose items 0, 1 and 2 are the
            team frame, team stats and champion(s).
        save_directory: Prefix prepended to ``'championship_importance.png'``.
        top: Number of rows per team used to build the features.
    """
    save_file = save_directory + 'championship_importance.png'
    if os.path.exists(save_file):
        return

    xs, ys, teams = [], [], []
    for season in all_res:
        season_res = all_res[season]
        team_df = season_res[0]
        team_stats = season_res[1]
        champion = season_res[2]
        for team, g in team_df.groupby('TEAM'):
            top_rows = g.nlargest(top, 'TIME')
            xs.append(top_rows[['off_norm', 'def_norm']].unstack().values)
            ys.append(1 if team in champion else 0)
            teams.append(team + '_' + season)
    xs = np.vstack(xs)
    ys = np.array(ys)

    fts = []
    row_ids = np.arange(len(xs))
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):
        for i in np.where(ys == 1)[0]:
            # Leave the i-th (positive) sample out.
            keep = row_ids != i
            rfr = BalancedRandomForestClassifier(n_estimators=ntree)
            rfr.fit(xs[keep], ys[keep])
            fts.append(rfr.feature_importances_)
    fts = np.vstack(fts)

    ranks = range(1, top + 1)
    feature_names = (['off' + str(r) for r in ranks]
                     + ['def' + str(r) for r in ranks])

    fig, ax = plt.subplots(figsize=(8, 6))
    for position, _ in enumerate(feature_names):
        ax.boxplot(fts[:, position], positions=[position])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')
    plt.savefig(save_file)
    plt.close()
def test_balanced_random_forest_oob_binomial(ratio):
    """Regression test for #655: OOB score stays near 0.5 on binomial labels."""
    n_samples = 1000
    rng = np.random.RandomState(42)

    # Labels are drawn independently of the single feature.
    X = np.arange(n_samples).reshape(-1, 1)
    y = rng.binomial(1, ratio, size=n_samples)

    erf = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    erf.fit(X, y)

    distance_from_chance = np.abs(erf.oob_score_ - 0.5)
    assert distance_from_chance < 0.1
def evaluate_on_validation_or_test(self, test=False):
    """Fit the stored configuration and print its AUC on validation or test data.

    Hyper-parameters are read from ``param_RF_<epoch>.json`` in
    ``self.result_folder``.  The median imputer is fitted on ``self.X`` and
    applied to every other set.

    Args:
        test: When false, train on ``self.X`` and report the AUC on
            ``self.X_validation``.  When true, train on training plus
            validation data and report the AUC on ``self.X_test``.
    """
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)

    # NOTE(review): with several entries in the file the last one wins.
    for data in dati:
        del data['value']  # 'value' is not passed to the classifier
        rf_model = BalancedRandomForestClassifier(**data)

    trainX, trainy = self.X, self.y
    valx, valy = self.X_validation, self.y_validation
    if test:
        testx, testy = self.X_test, self.y_test

    # ``np.nan``: the ``np.NaN`` alias no longer exists in NumPy 2.0.
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    imputer = median_imputer.fit(trainX)

    trainX = pd.DataFrame(imputer.transform(trainX),
                          columns=trainX.columns, index=trainX.index)
    valx = pd.DataFrame(imputer.transform(valx),
                        columns=valx.columns, index=valx.index)

    if test:
        testx = pd.DataFrame(imputer.transform(testx),
                             columns=testx.columns, index=testx.index)
        # NOTE(review): validation rows are folded into the training data
        # only for the final test run - confirm against the original layout.
        trainX = pd.concat([trainX, valx])
        trainy = np.concatenate((trainy, valy))

    rf_model.fit(trainX, trainy)

    if test:
        roc_rf = roc_auc_score(testy, rf_model.predict_proba(testx)[:, 1])
        print("Test AUC: {}".format(str(roc_rf)))
    else:
        roc_rf = roc_auc_score(valy, rf_model.predict_proba(valx)[:, 1])
        print("Validation AUC: {}".format(str(roc_rf)))
def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                       params, clf_type, question):
    """Fit the selected imbalanced-ensemble classifier and score it on the test set.

    Args:
        train_x, train_y: Training features and labels.
        test_x, test_y: Test features and labels.  The confusion matrix is
            unpacked into four cells, so a two-class problem is expected.
        estimator: One of ``'BalancedRandomForestClassifier'``,
            ``'BalancedBaggingClassifier'`` or ``'EasyEnsembleClassifier'``.
        params: Hyper-parameters; ``n_estimators`` and ``sampling_strategy``
            are always read, ``bootstrap`` and ``max_samples`` only for the
            bagging classifier.
        clf_type: Forwarded to ``self.calc_cross_val_scores``.
        question: Identifier stored in the result and forwarded to
            ``self.calc_cross_val_scores``.

    Returns:
        Tuple ``(cross_val_scores, estimator_scores)`` where the second item
        is a dict of test-set metrics.

    Raises:
        ValueError: If ``estimator`` is not one of the supported names.
    """
    if estimator == 'BalancedRandomForestClassifier':
        clf = BalancedRandomForestClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'BalancedBaggingClassifier':
        clf = BalancedBaggingClassifier(
            n_estimators=params['n_estimators'],
            bootstrap=params['bootstrap'],
            max_samples=params['max_samples'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'EasyEnsembleClassifier':
        clf = EasyEnsembleClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError('Unsupported estimator: {!r}'.format(estimator))

    clf.fit(train_x, train_y)
    cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                  clf_type, question)

    predicted_labels = clf.predict(test_x)
    tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
    specificity = round((tn / (tn + fp)) * 100, 2)

    # Probability assigned to the second class, used for ROC AUC.
    predicted_prob_true = clf.predict_proba(test_x)[:, 1]

    estimator_scores = {}
    estimator_scores['Question'] = question
    estimator_scores['Accuracy'] = round(
        accuracy_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Balanced Accuracy'] = round(
        balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Precision'] = round(
        precision_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Recall'] = round(
        recall_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Specificity'] = specificity
    # F1 and ROC AUC are kept on the 0-1 scale, unlike the percentages above.
    estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
    estimator_scores['ROC AUC'] = round(
        roc_auc_score(test_y, predicted_prob_true), 2)

    return cross_val_scores, estimator_scores
class BaselineRandomForest(BaseClassifier):
    """Plain (non-balanced) random-forest baseline with feature preprocessing.

    The feature columns seen during ``fit`` are remembered so that
    ``predict_proba`` feeds the forest the same columns in the same order.
    """

    def __init__(self):
        self.random_forest_classifier = RandomForestClassifier(
            n_estimators=500,
            # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is
            # rejected by recent scikit-learn releases.
            max_features='sqrt',
            max_depth=None,
            n_jobs=1,
            class_weight=None,
            criterion='entropy',
            min_samples_split=2,
            min_samples_leaf=1)
        self.feature_preprocessor = FeaturePreprocessor()
        # Column index of the training features; set by fit() / load_model().
        self.feature_list = None
        self.model_filename = 'baseline_rf.pkl'

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame):
        """Preprocess ``samples`` and fit the forest on the ``classALeRCE`` labels.

        Args:
            samples: Feature frame; preprocessed and de-duplicated before use.
            labels: Frame with a ``classALeRCE`` column, looked up by the
                index of ``samples``.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)

        # Intersect samples and labels.
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        self.feature_list = samples.columns
        samples_np_array = samples.values
        labels_np_array = labels['classALeRCE'].loc[samples.index].values
        self.random_forest_classifier.fit(samples_np_array, labels_np_array)

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        """Return class probabilities, one row per sample and one column per class.

        The result is indexed by the sample index, named ``'oid'``.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        # Select the training columns in their training order.
        samples_np_array = samples[self.feature_list].values
        predicted_probs = self.random_forest_classifier.predict_proba(
            samples_np_array)
        predicted_probs_df = pd.DataFrame(
            predicted_probs,
            columns=self.get_list_of_classes(),
            index=samples.index.values)
        predicted_probs_df.index.name = 'oid'
        return predicted_probs_df

    def get_list_of_classes(self) -> list:
        """Return the class labels known to the fitted forest."""
        return self.random_forest_classifier.classes_

    def save_model(self, directory: str) -> None:
        """Pickle the forest and the feature list into ``directory``."""
        with open(os.path.join(directory, self.model_filename), 'wb') as f:
            pickle.dump(self.random_forest_classifier, f,
                        pickle.HIGHEST_PROTOCOL)

        with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        """Restore the forest and the feature list written by ``save_model``.

        NOTE: unpickling executes arbitrary code; only load model
        directories that come from a trusted source.
        """
        rf = pd.read_pickle(os.path.join(directory, self.model_filename))
        self.random_forest_classifier = rf
        self.feature_list = pd.read_pickle(
            os.path.join(directory, 'feature_list.pkl'))
def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the OOB score tracks the hold-out score and the low-estimator warning.

    The forest is trained on the first half of the data and scored on the
    second half; the out-of-bag score must be within 0.1 of that score.
    With a single tree, fitting must emit a ``UserWarning``.
    """
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    n_samples = X.shape[0]
    half = n_samples // 2
    est.fit(X[:half, :], y[:half])
    test_score = est.score(X[half:, :], y[half:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    # The two context managers must be separated by a comma: ``a and b``
    # evaluates to a single object, so only one of them would be entered.
    with pytest.warns(UserWarning), np.errstate(divide="ignore", invalid="ignore"):
        est.fit(X, y)
def _second_class_recall_percent(cnf_matrix):
    """Return row-1 recall of a confusion matrix, in percent."""
    return 100 * cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])


def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    """Compare a plain, an oversampled and a balanced random forest.

    Each model is scored by the recall of the second class (row 1 of the
    confusion matrix) on the test set.  Scores are printed, the balanced
    forest's feature importances and the three scores are shown as bar
    charts.

    Args:
        X_train, y_train: Original training data; ``y_train`` must expose
            ``.values``.
        X_test, y_test: Test data.
        X_train_res, y_train_res: Oversampled training data; ``y_train_res``
            must expose ``.ravel()``.

    Returns:
        Tuple ``(without, within)``: score of the plain forest and score of
        the balanced forest.
    """
    # Plain forest on the imbalanced data.
    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    cnf_matrix_tra = confusion_matrix(y_test, rf.predict(X_test))
    without = _second_class_recall_percent(cnf_matrix_tra)
    print("Random Forest (niezbalansowany): {}%".format(without))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    # Plain forest on the oversampled data.
    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    cnf_matrix_tra = confusion_matrix(y_test, rf_oversampling.predict(X_test))
    with_oversampling = _second_class_recall_percent(cnf_matrix_tra)
    # Report the oversampling score itself (previously printed `without`).
    print("Random Forest (z oversamplingiem): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    # Balanced forest (under-sampling inside each bootstrap).
    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0,
                                         n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    cnf_matrix_tra = confusion_matrix(y_test, brf.predict(X_test))
    within = _second_class_recall_percent(cnf_matrix_tra)
    print("Random Forest (zbalansowany - undersampling): {}%".format(within))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])
    print(brf.feature_importances_)

    # Feature importances of the balanced forest.
    # NOTE(review): the labels assume exactly these five features, in this
    # order - confirm against the caller's columns.
    objects = ('country','gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_*100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent zależności')
    plt.title('Zależność poszczególnych atrybutów')
    plt.show()

    # Side-by-side comparison of the three scores.
    objects = ('Random Forest niezbalansowany','Random Forest z oversamplingiem', 'Random Forest zbalansowany')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Random Forest')
    plt.show()
    return without, within
def evaluate(X_train, y_train, X_test, y_test):
    """Return confusion matrices for the most and second most probable class.

    A 500-tree balanced random forest, seeded with the module-level
    ``seed``, is fitted on the training data.  Predictions are the column
    positions of the highest and second-highest probabilities.

    NOTE(review): column positions are compared with ``y_test`` directly,
    which presumably relies on labels being 0..k-1 - confirm.

    Returns:
        Tuple ``(top1_confusion, top2_confusion)``.
    """
    forest = BalancedRandomForestClassifier(n_estimators=500,
                                            random_state=seed)
    forest = forest.fit(X_train, y_train)

    # Columns sorted by ascending probability: last is the best guess.
    ranked_columns = forest.predict_proba(X_test).argsort(axis=1)
    best_guess = ranked_columns[:, -1]
    second_guess = ranked_columns[:, -2]

    top1_confusion = metrics.confusion_matrix(y_test, best_guess)
    top2_confusion = metrics.confusion_matrix(y_test, second_guess)
    return top1_confusion, top2_confusion
def balanced_random_forest(train_features, train_labels, test_features,
                           feature_list=None, hfo_type_name=None):
    """Fit a balanced random forest and predict on the test features.

    Args:
        train_features, train_labels: Training data.
        test_features: Data to predict on.
        feature_list: Unused by this function.
        hfo_type_name: Unused by this function.

    Returns:
        Tuple ``(predictions, probabilities, model)`` where
        ``probabilities`` is the second column of ``predict_proba``.
    """
    model = BalancedRandomForestClassifier(
        random_state=32,
        n_jobs=-1,  # use all available processors
    )
    model.fit(train_features, train_labels)

    # Hard labels first, then the probability of the second class.
    predictions = model.predict(test_features)
    probabilities = model.predict_proba(test_features)[:, 1]
    return predictions, probabilities, model
def apply_balanced_RF_classifier(X_train, y_train, model_path):
    """Train a balanced random forest and persist it.

    Args:
        X_train: Dataframe with all the features used for training.
        y_train: Series with one label per row of ``X_train``.
        model_path: Destination handed to ``pickle_models``.

    Returns:
        The trained balanced random forest model.
    """
    forest_settings = {'n_estimators': 50, 'random_state': 0, 'n_jobs': -1}
    BRF_model = BalancedRandomForestClassifier(**forest_settings)

    BRF_model.fit(X_train, y_train)
    pickle_models(BRF_model, model_path)
    return BRF_model
def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    """Warm-start must reject fewer estimators and warn when nothing is added."""
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*imbalanced_dataset)

    # ``match`` checks the exception text; the former ``message`` keyword
    # never did and is no longer accepted by pytest.
    with pytest.raises(ValueError, match="must be larger or equal to"):
        brf.set_params(warm_start=True, n_estimators=2)
        brf.fit(*imbalanced_dataset)

    brf.set_params(n_estimators=10)
    brf.fit(*imbalanced_dataset)

    # Refitting with an unchanged number of estimators only warns.
    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        brf.fit(*imbalanced_dataset)
def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    """Warm-start must reject fewer estimators and warn when nothing is added."""
    dataset = imbalanced_dataset
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*dataset)

    # Shrinking the ensemble under warm_start is an error.
    shrink_rejected = pytest.raises(
        ValueError, match="must be larger or equal to"
    )
    with shrink_rejected:
        brf.set_params(warm_start=True, n_estimators=2)
        brf.fit(*dataset)

    # Growing it is fine ...
    brf.set_params(n_estimators=10)
    brf.fit(*dataset)

    # ... and refitting at the same size only triggers a warning.
    nothing_added = pytest.warns(
        UserWarning, match="Warm-start fitting without"
    )
    with nothing_added:
        brf.fit(*dataset)
def fit(self, X, Y, sample_weight=None):
    """Build a ``BalancedRandomForestClassifier`` from this object's settings and fit it.

    Args:
        X: Training features.
        Y: Training labels.
        sample_weight: Optional per-sample weights, forwarded to the
            underlying estimator's ``fit``.

    Returns:
        ``self``, with the fitted model stored in ``self.estimator``.
    """
    # Imported lazily so that imblearn is only required when fitting.
    from imblearn.ensemble import BalancedRandomForestClassifier

    estimator = BalancedRandomForestClassifier(
        n_estimators=self.n_estimators,
        criterion=self.criterion,
        max_features=self.max_features,
        min_samples_split=self.min_samples_split,
        min_samples_leaf=self.min_samples_leaf,
        min_weight_fraction_leaf=self.min_weight_fraction_leaf,
        bootstrap=self.bootstrap,
        min_impurity_decrease=self.min_impurity_decrease,
        random_state=self.random_state,
        n_jobs=self.n_jobs,
        class_weight=self.class_weight,
        sampling_strategy=self.sampling_strategy,
        replacement=self.replacement)

    # Honour the caller's weights instead of silently dropping them.
    estimator.fit(X, Y, sample_weight=sample_weight)

    # Only publish the estimator once fitting has succeeded.
    self.estimator = estimator
    return self
def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Stored samplers, trees and pipelines must be mutually consistent."""
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0)
    brf.fit(X, y)

    for idx in range(n_estimators):
        # Resampling through the stand-alone sampler ...
        X_res, y_res = brf.samplers_[idx].fit_resample(X, y)
        # ... and through the sampler step of the matching pipeline.
        sampler_step = brf.pipelines_[idx].named_steps['randomundersampler']
        X_res_2, y_res_2 = sampler_step.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        # Compare tree-on-resampled-data with the pipeline, first on
        # labels, then on probabilities.
        for method_name in ('predict', 'predict_proba'):
            fitted_tree = brf.estimators_[idx].fit(X_res, y_res)
            y_pred = getattr(fitted_tree, method_name)(X)
            fitted_pipeline = brf.pipelines_[idx].fit(X, y)
            y_pred_2 = getattr(fitted_pipeline, method_name)(X)
            assert_array_equal(y_pred, y_pred_2)
def objective(trial):
    """Optuna objective: fit a balanced random forest, return validation AUC.

    Training, validation and test data come from
    ``self.df_train_media``, ``self.df_validation_media`` and
    ``self.df_test_media`` (``self`` is taken from the enclosing scope);
    column ``'41'`` is the target, all other columns are features.  The
    test AUC is printed for information only.

    Args:
        trial: Optuna trial used to sample the forest hyper-parameters.

    Returns:
        ROC AUC on the validation frame.
    """
    target_column = '41'

    def features_and_target(frame):
        # Every column except the target, plus the target itself, as arrays.
        features = frame.loc[:, frame.columns != target_column].values
        return features, frame[target_column].values

    train_X, train_y = features_and_target(self.df_train_media)
    val_X, val_y = features_and_target(self.df_validation_media)
    test_X, test_y = features_and_target(self.df_test_media)

    # Sampling order is kept fixed so seeded studies stay reproducible.
    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    sampled = {}
    sampled['n_estimators'] = trial.suggest_categorical(
        'n_estimators', list_trees)
    sampled['max_features'] = trial.suggest_uniform('max_features', 0.15, 1.0)
    sampled['min_samples_split'] = trial.suggest_int(
        'min_samples_split', 2, 16)
    sampled['min_samples_leaf'] = trial.suggest_int('min_samples_leaf', 1, 16)
    sampled['min_weight_fraction_leaf'] = trial.suggest_uniform(
        'min_weight_fraction_leaf', 0, 0.5)
    sampled['max_depth'] = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(bootstrap=True, **sampled)
    brfmodel.fit(train_X, train_y)

    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    test_auc = roc_auc_score(test_y, brfmodel.predict_proba(test_X)[:, 1])
    print("Test AUC: " + str(test_auc))
    return aucbrf
def train_model(data):
    """Train five imbalance-aware classifiers on the loan data and evaluate them.

    Categorical columns are one-hot encoded, ``loan_default`` is the
    target, and a stratified 80/20 split is used.  Each fitted model is
    passed to ``predict_evaluate_classifier`` with the test split.

    Args:
        data: Frame with the raw features and the ``loan_default`` column.

    Returns:
        Tuple ``(rfc, lr, xgb, brfc, bbc)`` of fitted models.
    """
    categorical_columns = ['Employment.Type', 'Driving_flag', 'Bureau_bin']
    dataset = pd.get_dummies(data, columns=categorical_columns,
                             drop_first=True)

    y = dataset['loan_default']
    X = dataset.drop('loan_default', axis=1)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y)

    # (display name, estimator) in training and reporting order.
    named_models = [
        ('RandomForestClassifier',
         RandomForestClassifier(class_weight='balanced', n_estimators=100)),
        ('LogisticRegression',
         LogisticRegression(class_weight='balanced')),
        ('XGBClassifier',
         XGBClassifier(scale_pos_weight=3.4)),
        ('BalancedRandomForestClassifier',
         BalancedRandomForestClassifier(max_depth=4, random_state=0)),
        ('BalancedBaggingClassifier',
         BalancedBaggingClassifier(n_estimators=100, random_state=42)),
    ]

    for _, estimator in named_models:
        estimator.fit(X_train, y_train)

    for label, estimator in named_models:
        print('Classifier: ' + label)
        predict_evaluate_classifier(X_test, y_test, estimator)

    rfc, lr, xgb, brfc, bbc = [estimator for _, estimator in named_models]
    return rfc, lr, xgb, brfc, bbc
def test_balanced_random_forest_error(imbalanced_dataset, forest_params, err_msg):
    """Fitting with invalid parameters must raise ``ValueError`` matching ``err_msg``."""
    features_and_labels = imbalanced_dataset
    brf = BalancedRandomForestClassifier(**forest_params)

    expected_failure = pytest.raises(ValueError, match=err_msg)
    with expected_failure:
        brf.fit(*features_and_labels)
plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target), ax=ax[1], title='Balanced bagging') ############################################################################### # Classification using random forest classifier with and without sampling ############################################################################### # Random forest is another popular ensemble method and it is usually # outperforming bagging. Here, we used a vanilla random forest and its balanced # counterpart in which each bootstrap sample is balanced. rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1) brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1) rf.fit(X_train, y_train) brf.fit(X_train, y_train) y_pred_rf = rf.predict(X_test) y_pred_brf = brf.predict(X_test) # Similarly to the previous experiment, the balanced classifier outperform the # classifier which learn from imbalanced bootstrap samples. In addition, random # forest outsperforms the bagging classifier. print('Random Forest classifier performance:') print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}' .format(balanced_accuracy_score(y_test, y_pred_rf), geometric_mean_score(y_test, y_pred_rf))) cm_rf = confusion_matrix(y_test, y_pred_rf) fig, ax = plt.subplots(ncols=2) plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0],
else: finite_idx = np.where(np.isfinite(column))[0] x = vectors[finite_idx, :] y = column[finite_idx] if y.sum() == 0 or y.sum() == len(y): print("%15s: undefined" % (name)) continue train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=y) if args.brf: rf = BalancedRandomForestClassifier(n_estimators=100, n_jobs=4) else: rf = RandomForestClassifier(n_estimators=100, n_jobs=4) rf.fit(train_x, train_y) p_te = rf.predict_proba(test_x) auc_te = roc_auc_score(test_y, p_te[:, 1]) bacc = balanced_accuracy_score(test_y, p_te[:, 1].round(0)) print("%15s: %3.5f %3.5f" % (name, auc_te, bacc)) bacc_av += bacc auc_av += auc_te if not (args.save is None): gzpickle(args.save + '_%i.pkz' % i, rf) print('Averages:') print('AUC: %8.3f BAcc: %8.3f' % (auc_av / (i + 1), bacc_av / (i + 1)))
def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    """Smoke test: fitting with per-sample weights must not raise."""
    X, y = imbalanced_dataset
    n_samples = y.shape[0]

    rng = np.random.RandomState(42)
    sample_weight = rng.rand(n_samples)

    forest = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    forest.fit(X, y, sample_weight)
# %% [markdown] ''' ## Train a random forest classifier *Note: this may take a while* ''' # %% clf = BalancedRandomForestClassifier(n_estimators=2000, replacement=True, sampling_strategy='not minority', n_jobs=4, random_state=42, verbose=1) clf.fit(X_train, Y_train) Y_test_pred = clf.predict(X_test) print('\nClassifier performance') print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred, zero_division=0)) # %% [markdown] ''' ## Robustness to unforseen scenarios What if the subjects in the test set wore the device differently from those in the training set? For example, suppose that all the subjects in the training set were right-handed, but the test subjects are left-handed. This would more or less result in the device being rotated.
def test_balanced_random_forest_error(imbalanced_dataset, forest_params, err_msg):
    """Fitting with invalid parameters must raise ``ValueError`` matching ``err_msg``."""
    brf = BalancedRandomForestClassifier(**forest_params)
    # ``match`` checks the exception text; the former ``message`` keyword
    # never did and is no longer accepted by pytest.
    with pytest.raises(ValueError, match=err_msg):
        brf.fit(*imbalanced_dataset)
def _normalize_text_column(series, stop_words):
    """Lower-case a text column, replace punctuation by spaces, drop stop words."""
    lowered = series.apply(lambda text: " ".join(str(text).lower().split()))
    # regex=True is required: recent pandas treats the pattern literally by
    # default, which would leave all punctuation in place.
    no_punctuation = lowered.str.replace(r'[^\w\s]', ' ', regex=True)
    return no_punctuation.apply(
        lambda text: " ".join(
            word for word in str(text).split() if word not in stop_words))


def Clasificar(database, new, path):
    """Classify new companies as rejected / of interest from their text fields.

    A balanced random forest is trained on bag-of-words features of the
    ``Category`` and ``Description`` columns of a 40% sample of
    ``database`` and applied to ``new``.  Training metrics are printed.

    Side effects: disables the pandas chained-assignment warning, silences
    all warnings, and adds a ``Prediction`` column of NaN to the ``new``
    frame passed in.

    Args:
        database: Labelled companies; needs the columns ``Operation``,
            ``Investee``, ``Category``, ``Description``, ``Category.1`` and
            ``Area of Focus``.
        new: Companies to classify; needs ``Transaction Name``,
            ``Organization Name``, ``Categories`` and ``Description``.
        path: Unused by this function.

    Returns:
        ``new`` with renamed columns plus ``Prediction`` (1 = not rejected),
        ``Prob. of being rejected`` and ``Prob. of being of interest``.
    """
    pd.options.mode.chained_assignment = None

    if 'Response by Category' in list(database.columns):
        database = database.drop(
            ['Response by Category', 'Response by Description'], axis=1)
    # NOTE(review): the 40% sample is taken unconditionally here - confirm
    # it was not meant to happen only when the columns above exist.
    database = database.sample(frac=0.4, replace=False)

    # Normalise inconsistent spellings of the labels.
    database["Category.1"] = database["Category.1"].replace("rejected", "Rejected")
    database["Category.1"] = database["Category.1"].replace("B2C ", "B2C")
    database["Category.1"] = database["Category.1"].replace("FIntech", "Fintech")

    database['Prediction'] = np.nan
    # Kept as in the original: marks the caller's frame, then continues
    # with a copy that has no 'Prediction' column.
    new['Prediction'] = np.nan
    new = new.drop(['Prediction'], axis=1)

    # Classifier
    warnings.filterwarnings('ignore')
    print('Importando bases de datos')
    new = new.rename(columns={'Categories': 'Category',
                              'Organization Name': 'Investee'})
    train = database[['Operation', 'Investee', 'Category', 'Description',
                      'Category.1', 'Area of Focus']].dropna()
    # Copy: columns are overwritten below and must not alias ``new``.
    newdata = new[['Transaction Name', 'Investee', 'Category',
                   'Description']].copy()

    print('Preprocesamiento del texto')
    stop_words = set(stopwords.words('english'))
    for column in ['Category', 'Description']:
        train[column] = _normalize_text_column(train[column], stop_words)
        newdata[column] = _normalize_text_column(newdata[column], stop_words)

    train_src1 = train[['Category', 'Description', 'Category.1']].copy()
    # Binary target: 0 = rejected, 1 = anything else.
    train_src1['Rejected?'] = 0
    train_src1.loc[train_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1
    new_src1 = newdata[['Category', 'Description']]

    # Bag-of-words features.  The vectorizer is re-fitted per column, so the
    # matching transform of the new data must follow each fit directly.
    vectorizer = CountVectorizer()
    vectorI = pd.DataFrame(
        vectorizer.fit_transform(train_src1['Category']).toarray())
    vectorI_new = pd.DataFrame(
        vectorizer.transform(new_src1['Category']).toarray())
    vectorIdes = pd.DataFrame(
        vectorizer.fit_transform(train_src1['Description']).toarray())
    vectorIdes_new = pd.DataFrame(
        vectorizer.transform(new_src1['Description']).toarray())
    vectorI = pd.concat([vectorI, vectorIdes], axis=1)
    vectorI_new = pd.concat([vectorI_new, vectorIdes_new], axis=1)

    print('Entrenamiento')
    # Binary classification: rejected vs. not rejected.
    # Resampling + random forest.
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(vectorI, train_src1['Rejected?'])
    y_train_pred = brf.predict(vectorI)
    # These are training-set metrics, not a held-out evaluation.
    print('Confusion matrix: \n' , confusion_matrix(train_src1['Rejected?'], y_train_pred))
    print('Accuracy: \n' , accuracy_score(train_src1['Rejected?'], y_train_pred))
    print('Recall: \n' , recall_score(train_src1['Rejected?'], y_train_pred))

    print('Clasificacion y exportacion')
    # Apply the model to the new data.
    y_new_predict = brf.predict(vectorI_new)
    y_new_predict_proba = brf.predict_proba(vectorI_new)
    newdata['Prediction'] = y_new_predict
    newdata['Prob. of being rejected'] = y_new_predict_proba[:, 0]
    newdata['Prob. of being of interest'] = y_new_predict_proba[:, 1]

    # Attach the predictions to the companies frame.
    new = pd.concat(
        [new, newdata[['Prediction', 'Prob. of being rejected',
                       'Prob. of being of interest']]],
        axis=1, sort=False)
    return new
## Train a random forest classifier *Note: this may take a while* ''' # %% clf = BalancedRandomForestClassifier( n_estimators=2000, replacement=True, sampling_strategy='not minority', oob_score=True, n_jobs=4, random_state=42, verbose=1 ) clf.fit(X_train, Y_train) Y_test_pred = clf.predict(X_test) print('\nClassifier performance') print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred, zero_division=0)) # This will be the training set Y_in_train = clf.oob_decision_function_.astype('float32') # This will be the test set Y_in_test = clf.predict_proba(X_test).astype('float32') # %% [markdown] ''' ## Architecture design As a baseline, let's use a single-layer bidirectional LSTM.