import logging
import os
from datetime import datetime

import joblib
import numpy as np
from scipy.stats import randint, uniform
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# print_to_consol, get_confidence_interval, feature_importances_best_estimator
# and feature_importances_error_bars are project-local helpers.


def randomised_search(self):
    print_to_consol('Running randomized search to find best classifier')

    # create the decision tree
    clf1 = DecisionTreeClassifier(random_state=20,
                                  class_weight='balanced',
                                  max_features=self.numf)

    logging.info('Initialised classifier')

    # set up randomized search
    param_dict = {
        'criterion': ['gini', 'entropy'],
        'min_samples_split': randint(2, 20),
        'max_depth': randint(1, 10),
        'min_samples_leaf': randint(1, 20),
        'max_leaf_nodes': randint(10, 20)
    }

    logging.info(
        f'Following parameters will be explored in randomized search \n'
        f'{param_dict} \n')

    # build and run the randomized search
    rand_search = RandomizedSearchCV(clf1,
                                     param_dict,
                                     random_state=5,
                                     cv=self.cv,
                                     n_iter=self.numc,
                                     scoring='accuracy',
                                     n_jobs=-1)

    rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

    best_parameters = rand_search_fitted.best_params_
    best_scores = rand_search_fitted.best_score_

    logging.info(
        f'Running randomised search for best parameters of classifier \n'
        f'Best parameters found: {best_parameters} \n'
        f'Best accuracy scores found: {best_scores} \n')

    self.model = rand_search_fitted.best_estimator_

    datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
    joblib.dump(
        self.model,
        os.path.join(self.directory, 'best_predictor_' + datestring + '.pkl'))

    logging.info(f'Writing best classifier to disk in {self.directory} \n')

    print_to_consol(
        'Getting 95% confidence interval for uncalibrated classifier')

    alpha, upper, lower = get_confidence_interval(self.X_train_scaled,
                                                  self.y_train,
                                                  self.X_test_scaled,
                                                  self.y_test, self.model,
                                                  self.directory,
                                                  self.bootiter,
                                                  'uncalibrated')

    logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                 f'for uncalibrated classifier. \n')

    print_to_consol('Getting feature importances for best classifier')

    best_clf_feat_import = self.model.feature_importances_
    best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                             self.X_train_scaled.columns),
                                         reverse=True)

    logging.info(
        f'Feature importances for best classifier {best_clf_feat_import_sorted} \n')

    print_to_consol('Plotting feature importances for best classifier')

    feature_importances_best_estimator(best_clf_feat_import_sorted,
                                       self.directory)

    logging.info(
        'Plotting feature importances for best classifier in decreasing order \n')
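# randomised_search above (and the variants below) call the project-local
# helper get_confidence_interval with (X_train, y_train, X_test, y_test,
# model, directory, bootiter, tag) and unpack (alpha, upper, lower). A minimal
# sketch of what such a helper could look like, assuming a simple bootstrap
# over test-set predictions; the project's real implementation may resample
# or refit differently:
def bootstrap_confidence_interval(X_train, y_train, X_test, y_test, model,
                                  directory, bootiter, tag, alpha=0.95):
    # X_train, y_train, directory and tag are unused in this sketch; they are
    # kept only to match the call sites above
    y_true = np.asarray(y_test)
    y_pred = np.asarray(model.predict(X_test))
    rng = np.random.default_rng(42)
    n = len(y_true)
    scores = []
    for _ in range(bootiter):
        # draw a bootstrap sample of test indices and score accuracy on it
        idx = rng.integers(0, n, n)
        scores.append(np.mean(y_true[idx] == y_pred[idx]))
    # percentile bounds of the bootstrap accuracy distribution
    lower = np.percentile(scores, (1 - alpha) / 2 * 100)
    upper = np.percentile(scores, (1 + alpha) / 2 * 100)
    return alpha * 100, upper * 100, lower * 100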
def randomised_search(self):
    print('*' * 80)
    print('* Running randomized search to find best classifier')
    print('*' * 80)

    # create the decision tree used as base estimator for AdaBoost
    clf1 = DecisionTreeClassifier(random_state=20,
                                  class_weight='balanced',
                                  max_features=self.numf)
    ada = AdaBoostClassifier(base_estimator=clf1,
                             algorithm="SAMME.R",
                             random_state=55)

    logging.info(
        'Initialised decision tree and AdaBoost using balanced class weights')

    # set up randomized search
    param_dict = {
        'base_estimator__criterion': ['gini', 'entropy'],
        'n_estimators': randint(100, 10000),  # number of base estimators to use
        'learning_rate': uniform(0.0001, 1.0),
        'base_estimator__min_samples_split': randint(2, 20),
        'base_estimator__max_depth': randint(1, 10),
        'base_estimator__min_samples_leaf': randint(1, 20),
        'base_estimator__max_leaf_nodes': randint(10, 20)
    }

    logging.info(
        f'Following parameters will be explored in randomized search \n'
        f'{param_dict}')

    # build and run the randomized search
    rand_search = RandomizedSearchCV(ada,
                                     param_dict,
                                     random_state=5,
                                     cv=self.cv,
                                     n_iter=self.numc,
                                     scoring='accuracy',
                                     n_jobs=-1)

    rand_search_fitted = rand_search.fit(self.X_train, self.y_train)

    best_parameters = rand_search_fitted.best_params_
    best_scores = rand_search_fitted.best_score_

    logging.info(
        f'Running randomised search for best parameters of a decision tree \n'
        f'with AdaBoost; scoring is accuracy \n'
        f'Best parameters found: {best_parameters} \n'
        f'Best accuracy scores found: {best_scores} \n')

    self.model = rand_search_fitted.best_estimator_

    datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
    joblib.dump(
        self.model,
        os.path.join(self.directory, 'best_predictor_' + datestring + '.pkl'))

    logging.info(f'Writing best classifier to disk in {self.directory} \n')

    print('*' * 80)
    print('* Getting 95% confidence interval for best classifier')
    print('*' * 80)

    alpha, upper, lower = get_confidence_interval(self.X_train, self.y_train,
                                                  self.X_test, self.y_test,
                                                  self.model, self.directory,
                                                  self.bootiter,
                                                  'uncalibrated')

    logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n')

    print('*' * 80)
    print('* Getting feature importances for best classifier')
    print('*' * 80)

    best_clf_feat_import = self.model.feature_importances_
    best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                             self.X_train.columns),
                                         reverse=True)

    logging.info(
        f'Feature importances for best classifier {best_clf_feat_import_sorted} \n')

    # mean feature importances across all base estimators in the ensemble
    all_clf_feat_import_mean = np.mean(
        [tree.feature_importances_ for tree in self.model.estimators_],
        axis=0)
    all_clf_feat_import_mean_sorted = sorted(zip(all_clf_feat_import_mean,
                                                 self.X_train.columns),
                                             reverse=True)

    logging.info(
        f'Mean feature importances across all trees '
        f'{all_clf_feat_import_mean_sorted} \n')

    print('*' * 80)
    print('* Plotting feature importances across all trees')
    print('*' * 80)

    feature_importances_best_estimator(best_clf_feat_import_sorted,
                                       self.directory)

    logging.info(
        'Plotting feature importances for best classifier in decreasing order \n')

    feature_importances_error_bars(self.model, self.X_train.columns,
                                   self.directory)

    logging.info(
        'Plotting feature importances for best classifier with errorbars \n')
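# feature_importances_error_bars is another project-local plotting helper. A
# minimal sketch under the assumption that it averages the importances over
# the ensemble's base estimators and plots them with their standard deviation
# as error bars (matplotlib, file name and layout are assumptions here):
def plot_feature_importances_error_bars(model, feature_names, directory):
    import matplotlib.pyplot as plt

    importances = np.array(
        [tree.feature_importances_ for tree in model.estimators_])
    mean = importances.mean(axis=0)
    std = importances.std(axis=0)
    order = np.argsort(mean)[::-1]  # plot bars in decreasing order

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(range(len(order)), mean[order], yerr=std[order])
    ax.set_xticks(range(len(order)))
    ax.set_xticklabels(np.asarray(feature_names)[order], rotation=90)
    ax.set_ylabel('Mean feature importance')
    fig.tight_layout()
    fig.savefig(
        os.path.join(directory, 'feature_importances_with_error_bars.png'))
    plt.close(fig)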
def randomised_search(self):
    print_to_consol('Running randomized search to find best classifier')

    # create the k-nearest-neighbours classifier
    clf1 = KNeighborsClassifier()

    logging.info('Initialised classifier \n')

    # set up randomized search
    param_dict = {
        'n_neighbors': randint(2, 10),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size': randint(2, 50)
    }

    logging.info(
        f'Following parameters will be explored in randomized search \n'
        f'{param_dict} \n')

    # build and run the randomized search
    rand_search = RandomizedSearchCV(clf1,
                                     param_dict,
                                     random_state=5,
                                     cv=self.cv,
                                     n_iter=self.numc,
                                     scoring='accuracy',
                                     n_jobs=-1)

    rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

    best_parameters = rand_search_fitted.best_params_
    best_scores = rand_search_fitted.best_score_

    logging.info(
        f'Running randomised search for best parameters of classifier \n'
        f'Best parameters found: {best_parameters} \n'
        f'Best accuracy scores found: {best_scores} \n')

    self.model = rand_search_fitted.best_estimator_

    datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
    joblib.dump(
        self.model,
        os.path.join(self.directory, 'best_predictor_' + datestring + '.pkl'))

    logging.info(f'Writing best classifier to disk in {self.directory} \n')

    print_to_consol(
        'Getting 95% confidence interval for uncalibrated classifier')

    alpha, upper, lower = get_confidence_interval(self.X_train_scaled,
                                                  self.y_train,
                                                  self.X_test_scaled,
                                                  self.y_test, self.model,
                                                  self.directory,
                                                  self.bootiter,
                                                  'uncalibrated')

    logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                 f'for uncalibrated classifier. \n')

    print_to_consol('Getting feature importances for best classifier')

    # KNeighborsClassifier has no feature_importances_ attribute, so score
    # each feature on its own with cross-validation instead
    feature_list = []
    for i in range(len(self.X_train_scaled.columns)):
        X = self.X_train_scaled.iloc[:, i].values.reshape(-1, 1)
        # score the single feature, not the full feature matrix
        scores = cross_val_score(self.model, X, self.y_train, cv=self.cv)
        feature_list.append(scores.mean())
    # sort by score rather than feature name, matching the (importance, name)
    # pair order the other variants pass to the plotting helper
    feature_importance = sorted(zip(feature_list,
                                    self.X_train_scaled.columns),
                                reverse=True)

    logging.info(
        f'Feature importances for best classifier {feature_importance} \n')

    print_to_consol('Plotting feature importances for best classifier')

    feature_importances_best_estimator(feature_importance, self.directory)

    logging.info(
        'Plotting feature importances for best classifier in decreasing order \n')
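# feature_importances_best_estimator is the plotting helper shared by all
# three variants; it receives (importance, feature_name) pairs sorted in
# decreasing order. A minimal sketch of a bar-chart implementation
# (matplotlib, file name and layout are assumptions here):
def plot_feature_importances_best_estimator(sorted_pairs, directory):
    import matplotlib.pyplot as plt

    values = [value for value, _ in sorted_pairs]
    names = [name for _, name in sorted_pairs]

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.bar(range(len(values)), values)
    ax.set_xticks(range(len(values)))
    ax.set_xticklabels(names, rotation=90)
    ax.set_ylabel('Feature importance')
    fig.tight_layout()
    fig.savefig(
        os.path.join(directory, 'feature_importances_best_estimator.png'))
    plt.close(fig)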