    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        # create the AdaBoost ensemble with a decision tree as base estimator
        clf1 = DecisionTreeClassifier(random_state=20,
                                      class_weight='balanced',
                                      max_features=self.numf)
        ada = AdaBoostClassifier(base_estimator=clf1,
                                 algorithm="SAMME.R",
                                 random_state=55)

        logging.info(f'Initialised classifier using balanced class weights \n')

        # set up the randomized search parameter space
        param_dict = {
            'base_estimator__criterion': ['gini', 'entropy'],
            'n_estimators': randint(100, 10000),  # number of base estimators to use
            'learning_rate': uniform(0.0001, 1.0),
            'base_estimator__min_samples_split': randint(2, 20),
            'base_estimator__max_depth': randint(1, 10),
            'base_estimator__min_samples_leaf': randint(1, 20),
            'base_estimator__max_leaf_nodes': randint(10, 20)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        # build and run the randomized search
        rand_search = RandomizedSearchCV(ada,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.model, self.directory, self.bootiter,
            'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        best_clf_feat_import = self.model.feature_importances_
        best_clf_feat_import_sorted = sorted(zip(best_clf_feat_import,
                                                 self.X_train_scaled.columns),
                                             reverse=True)

        logging.info(
            f'Feature importances for best classifier {best_clf_feat_import_sorted} \n')

        # average the per-tree importances across all base estimators
        all_clf_feat_import_mean = np.mean(
            [tree.feature_importances_ for tree in self.model.estimators_],
            axis=0)
        all_clf_feat_import_mean_sorted = sorted(zip(
            all_clf_feat_import_mean, self.X_train_scaled.columns),
                                                 reverse=True)

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(best_clf_feat_import_sorted,
                                           self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n')

        feature_importances_error_bars(self.model,
                                       self.X_train_scaled.columns,
                                       self.directory)
        logging.info(
            f'Plotting feature importances for best classifier with errorbars \n')
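    # Hedged illustration (not part of the training pipeline above):
    # RandomizedSearchCV draws candidate values from the scipy.stats
    # distributions in param_dict via their .rvs() method. A manual draw,
    # assuming the randint/uniform objects imported for this module, would be:
    #
    #     from scipy.stats import randint, uniform
    #     n_estimators_sample = randint(100, 10000).rvs(random_state=5)
    #     learning_rate_sample = uniform(0.0001, 1.0).rvs(random_state=5)
    #
    # The 'base_estimator__' prefix routes a parameter to the wrapped
    # DecisionTreeClassifier inside the AdaBoost ensemble.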
    def detailed_analysis(self):
        print_to_consol(
            'Making a confusion matrix for test set classification outcomes')

        matrix_stats = confusion_matrix_and_stats(self.y_test, self.y_pred,
                                                  'before_cal', self.directory)

        logging.info(f'Detailed analysis of confusion matrix for test set. \n'
                     f'True positives: {matrix_stats["TP"]} \n'
                     f'True negatives: {matrix_stats["TN"]} \n'
                     f'False positives: {matrix_stats["FP"]} \n'
                     f'False negatives: {matrix_stats["FN"]} \n'
                     f'Classification accuracy: {matrix_stats["acc"]} \n'
                     f'Classification error: {matrix_stats["err"]} \n'
                     f'Sensitivity: {matrix_stats["sensitivity"]} \n'
                     f'Specificity: {matrix_stats["specificity"]} \n'
                     f'False positive rate: {matrix_stats["FP-rate"]} \n'
                     f'False negative rate: {matrix_stats["FN-rate"]} \n'
                     f'Precision: {matrix_stats["precision"]} \n'
                     f'F1-score: {matrix_stats["F1-score"]} \n')

        print_to_consol(
            'Plotting precision recall curve for test set class 1 probabilities')

        logging.info(
            f'Plotting precision recall curve for class 1 in test set probabilities. \n')

        plot_precision_recall_vs_threshold(self.y_test,
                                           self.y_pred_proba_ones,
                                           self.directory)

        print_to_consol(
            'Plotting ROC curve and calculating AUC for test set class 1 probabilities')

        logging.info(
            f'Plotting ROC curve for class 1 in test set probabilities. \n')

        self.fpr, self.tpr, self.thresholds = plot_roc_curve(
            self.y_test, self.y_pred_proba_ones, self.directory)

        AUC = round(
            roc_auc_score(self.y_test, self.y_pred_proba_ones) * 100, 2)

        logging.info(
            f'Calculating AUC for ROC curve for class 1 in test set probabilities: {AUC} \n')

        print_to_consol('Make a radar plot for performance metrics')

        radar_dict = {
            'Classification accuracy': matrix_stats["acc"],
            'Classification error': matrix_stats["err"],
            'Sensitivity': matrix_stats["sensitivity"],
            'Specificity': matrix_stats["specificity"],
            'False positive rate': matrix_stats["FP-rate"],
            'False negative rate': matrix_stats["FN-rate"],
            'Precision': matrix_stats["precision"],
            'F1-score': matrix_stats["F1-score"],
            'ROC AUC': AUC
        }

        plot_radar_chart(radar_dict, self.directory)

        print_to_consol(
            'Exploring probability thresholds, sensitivity, specificity for class 1')

        threshold_dict = evaluate_threshold(self.tpr, self.fpr,
                                            self.thresholds)

        logging.info(
            f'Exploring different probability thresholds and sensitivity-specificity trade-offs. \n'
            f'Threshold 0.2: {threshold_dict["0.2"]} \n'
            f'Threshold 0.3: {threshold_dict["0.3"]} \n'
            f'Threshold 0.4: {threshold_dict["0.4"]} \n'
            f'Threshold 0.5: {threshold_dict["0.5"]} \n'
            f'Threshold 0.6: {threshold_dict["0.6"]} \n'
            f'Threshold 0.7: {threshold_dict["0.7"]} \n'
            f'Threshold 0.8: {threshold_dict["0.8"]} \n'
            f'Threshold 0.9: {threshold_dict["0.9"]} \n')

        print_to_consol(
            'Calibrating classifier and writing to disk; getting new accuracy')

        self.calibrated_clf, clf_acc = calibrate_classifier(
            self.model, self.X_cal_scaled, self.y_cal)

        date = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.calibrated_clf,
            os.path.join(self.directory,
                         'best_calibrated_predictor_' + date + '.pkl'))

        logging.info(
            f'Calibrated the best classifier with X_cal and y_cal and new accuracy {clf_acc}\n'
            f'Writing file to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for calibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.calibrated_clf, self.directory, self.bootiter,
            'calibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for calibrated classifier. \n')

        print_to_consol('Running prediction for calibrated classifier')

        print_to_consol(
            'Getting class predictions and probabilities for test set with calibrated classifier')

        test_stats_cal, self.y_pred_cal, self.y_pred_proba_cal = testing_predict_stats(
            self.calibrated_clf, self.X_test_scaled, self.y_test)

        logging.info(
            f'Predicting on the test set with calibrated classifier. \n'
            f'Storing classes for calibrated classifier in y_pred and probabilities in y_pred_proba. \n')

        print_to_consol(
            'Calculate prediction stats for y_pred and y_pred_proba of test set with calibrated classifier')

        logging.info(
            f'Basic stats on the test set with calibrated classifier. \n'
            f'Prediction accuracy on the test set: {test_stats_cal["predict_acc"]} \n'
            f'Class distribution in the test set: {test_stats_cal["class_distribution"]} \n'
            f'Matthews Correlation Coefficient: {test_stats_cal["mcc"]} \n'
            f'Average number of class 1 samples: {test_stats_cal["class_one"]} \n'
            f'Average number of class 0 samples: {test_stats_cal["class_zero"]} \n'
            f'Null accuracy: {test_stats_cal["null_acc"]} \n')

        print_to_consol(
            'Plotting histogram for class 1 prediction probabilities for test set')

        # store the predicted probabilities for class 1 of the test set
        self.y_pred_proba_cal_ones = self.y_pred_proba_cal[:, 1]

        plot_hist_pred_proba(self.y_pred_proba_cal_ones, self.directory)

        logging.info(
            f'Plotting prediction probabilities for class 1 in test set in histogram for calibrated classifier. \n')

        print_to_consol(
            'Making a confusion matrix for test set classification outcomes with calibrated classifier')

        matrix_stats_cal = confusion_matrix_and_stats(self.y_test,
                                                      self.y_pred_cal,
                                                      'after_cal',
                                                      self.directory)

        logging.info(
            f'Detailed analysis of confusion matrix for test set with calibrated classifier. \n'
            f'True positives: {matrix_stats_cal["TP"]} \n'
            f'True negatives: {matrix_stats_cal["TN"]} \n'
            f'False positives: {matrix_stats_cal["FP"]} \n'
            f'False negatives: {matrix_stats_cal["FN"]} \n'
            f'Classification accuracy: {matrix_stats_cal["acc"]} \n'
            f'Classification error: {matrix_stats_cal["err"]} \n'
            f'Sensitivity: {matrix_stats_cal["sensitivity"]} \n'
            f'Specificity: {matrix_stats_cal["specificity"]} \n'
            f'False positive rate: {matrix_stats_cal["FP-rate"]} \n'
            f'False negative rate: {matrix_stats_cal["FN-rate"]} \n'
            f'Precision: {matrix_stats_cal["precision"]} \n'
            f'F1-score: {matrix_stats_cal["F1-score"]} \n')

        print_to_consol(
            'Plotting precision recall curve for test set class 1 probabilities with calibrated classifier')

        logging.info(
            f'Plotting precision recall curve for class 1 in test set probabilities with calibrated classifier. \n')

        plot_precision_recall_vs_threshold(self.y_test,
                                           self.y_pred_proba_cal_ones,
                                           self.directory)

        print_to_consol(
            'Plotting ROC curve and calculating AUC for test set class 1 probabilities with calibrated classifier')

        logging.info(
            f'Plotting ROC curve for class 1 in test set probabilities with calibrated classifier. \n')

        self.fpr_cal, self.tpr_cal, self.thresholds_cal = plot_roc_curve(
            self.y_test, self.y_pred_proba_cal_ones, self.directory)

        AUC_cal = round(
            roc_auc_score(self.y_test, self.y_pred_proba_cal_ones) * 100, 2)

        logging.info(
            f'Calculating AUC for ROC curve for class 1 in test set probabilities with calibrated classifier: {AUC_cal} \n')

        print_to_consol(
            'Make a radar plot for performance metrics with calibrated classifier')

        radar_dict_cal = {
            'Classification accuracy': matrix_stats_cal["acc"],
            'Classification error': matrix_stats_cal["err"],
            'Sensitivity': matrix_stats_cal["sensitivity"],
            'Specificity': matrix_stats_cal["specificity"],
            'False positive rate': matrix_stats_cal["FP-rate"],
            'False negative rate': matrix_stats_cal["FN-rate"],
            'Precision': matrix_stats_cal["precision"],
            'F1-score': matrix_stats_cal["F1-score"],
            'ROC AUC': AUC_cal
        }

        plot_radar_chart(radar_dict_cal, self.directory)

        print_to_consol(
            'Exploring probability thresholds, sensitivity, specificity for class 1 with calibrated classifier')

        threshold_dict_cal = evaluate_threshold(self.tpr_cal, self.fpr_cal,
                                                self.thresholds_cal)

        logging.info(
            f'Exploring different probability thresholds and sensitivity-specificity trade-offs \n'
            f'for calibrated classifier. \n'
            f'Threshold 0.2: {threshold_dict_cal["0.2"]} \n'
            f'Threshold 0.3: {threshold_dict_cal["0.3"]} \n'
            f'Threshold 0.4: {threshold_dict_cal["0.4"]} \n'
            f'Threshold 0.5: {threshold_dict_cal["0.5"]} \n'
            f'Threshold 0.6: {threshold_dict_cal["0.6"]} \n'
            f'Threshold 0.7: {threshold_dict_cal["0.7"]} \n'
            f'Threshold 0.8: {threshold_dict_cal["0.8"]} \n'
            f'Threshold 0.9: {threshold_dict_cal["0.9"]} \n')

        end = datetime.now()
        duration = end - self.start

        logging.info(f'Training lasted for {duration} \n')
        logging.info(f'Training completed \n')

        print_to_consol('Training completed')
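    # Hedged usage sketch (not called anywhere in this module): the calibrated
    # classifier serialised above can be restored with joblib and reused for
    # prediction; the file name and X_new_scaled below are illustrative
    # placeholders, not objects defined in this code base.
    #
    #     import joblib
    #     clf = joblib.load('best_calibrated_predictor_20240101_1200.pkl')
    #     class_one_probs = clf.predict_proba(X_new_scaled)[:, 1]
    #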
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        # create the logistic regression classifier
        clf1 = LogisticRegression(penalty='l2',
                                  random_state=20,
                                  class_weight='balanced')

        logging.info(f'Initialised classifier \n')

        # set up the randomized search parameter space
        param_dict = {'max_iter': randint(100, 10000), 'C': expon(scale=100)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        # build and run the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        self.model = rand_search_fitted.best_estimator_

        best_parameters = rand_search_fitted.best_params_
        coef = self.model.coef_
        intercept = self.model.intercept_
        n_feat = self.model.n_features_in_
        features = self.model.feature_names_in_

        logging.info(f'Running randomised search for best parameters of a \n'
                     f'Logistic Regression classifier; scoring is accuracy \n'
                     f'Best parameters found: {best_parameters} \n'
                     f'Best coefficients: {coef} \n'
                     f'Best intercept: {intercept} \n'
                     f'Number of features used for fit: {n_feat} \n'
                     f'Features used for fit: {features} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.model, self.directory, self.bootiter,
            'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')
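    # Hedged illustration (not part of the pipeline): the regularisation
    # strength C is drawn from an exponential distribution with scale=100, so
    # most candidates are moderate but occasional large values are also tried.
    # An equivalent manual draw, assuming scipy.stats.expon as used above:
    #
    #     from scipy.stats import expon
    #     c_candidates = expon(scale=100).rvs(size=5, random_state=5)
    #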
    def detailed_analysis(self):
        print_to_consol(
            'Making a confusion matrix for test set classification outcomes')

        matrix_stats, report = confusion_matrix_and_stats_multiclass(
            self.y_test, self.y_pred, 'before_cal', self.directory)

        logging.info(f'Detailed analysis of confusion matrix for test set. \n'
                     f'True positives: {matrix_stats["TP"]} \n'
                     f'True negatives: {matrix_stats["TN"]} \n'
                     f'False positives: {matrix_stats["FP"]} \n'
                     f'False negatives: {matrix_stats["FN"]} \n'
                     f'Classification accuracy: {matrix_stats["acc"]} \n'
                     f'Classification error: {matrix_stats["err"]} \n'
                     f'Sensitivity: {matrix_stats["sensitivity"]} \n'
                     f'Specificity: {matrix_stats["specificity"]} \n'
                     f'False positive rate: {matrix_stats["FP-rate"]} \n'
                     f'False negative rate: {matrix_stats["FN-rate"]} \n'
                     f'Precision: {matrix_stats["precision"]} \n'
                     f'F1-score: {matrix_stats["F1-score"]} \n')

        logging.info(
            f'Classification report on test set before calibration. \n'
            f'{report} \n')

        print_to_consol('Make a radar plot for performance metrics')

        radar_dict = {
            'Classification accuracy': matrix_stats["acc"],
            'Classification error': matrix_stats["err"],
            'Sensitivity': matrix_stats["sensitivity"],
            'Specificity': matrix_stats["specificity"],
            'False positive rate': matrix_stats["FP-rate"],
            'False negative rate': matrix_stats["FN-rate"],
            'Precision': matrix_stats["precision"],
            'F1-score': matrix_stats["F1-score"],
            'ROC AUC': None
        }

        plot_radar_chart(radar_dict, self.directory)

        print_to_consol(
            'Calibrating classifier and writing to disk; getting new accuracy')

        self.calibrated_clf, clf_acc = calibrate_classifier(
            self.model, self.X_cal, self.y_cal)

        date = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.calibrated_clf,
            os.path.join(self.directory,
                         'best_calibrated_predictor_' + date + '.pkl'))

        logging.info(
            f'Calibrated the best classifier with X_cal and y_cal and new accuracy {clf_acc}\n'
            f'Writing file to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for calibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train, self.y_train, self.X_test, self.y_test,
            self.calibrated_clf, self.directory, self.bootiter, 'calibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for calibrated classifier. \n')

        print_to_consol('Running prediction for calibrated classifier')

        print_to_consol(
            'Getting class predictions and probabilities for test set with calibrated classifier')

        test_stats_cal, self.y_pred_cal, self.y_pred_proba_cal = testing_predict_stats_multiclass(
            self.calibrated_clf, self.X_test, self.y_test)

        # write predicted classes and probabilities after calibration to disk
        y_pred_cal_out = os.path.join(self.directory,
                                      "y_pred_after_calibration.csv")
        np.savetxt(y_pred_cal_out, self.y_pred_cal, delimiter=",")
        y_pred_proba_cal_out = os.path.join(
            self.directory, "y_pred_proba_after_calibration.csv")
        np.savetxt(y_pred_proba_cal_out, self.y_pred_proba_cal, delimiter=",")

        logging.info(
            f'Writing y_pred and y_pred_proba after calibration to disk. \n'
            f'Predicting on the test set with calibrated classifier. \n'
            f'Storing classes for calibrated classifier in y_pred and probabilities in y_pred_proba. \n')

        print_to_consol(
            'Calculate prediction stats for y_pred and y_pred_proba of test set with calibrated classifier')

        logging.info(
            f'Basic stats on the test set with calibrated classifier. \n'
            f'Prediction accuracy on the test set: {test_stats_cal["predict_acc"]} \n'
            f'Class distribution in the test set: {test_stats_cal["class_distribution"]} \n'
            f'Matthews Correlation Coefficient: {test_stats_cal["mcc"]} \n')

        print_to_consol(
            'Making a confusion matrix for test set classification outcomes with calibrated classifier')

        matrix_stats_cal, report_cal = confusion_matrix_and_stats_multiclass(
            self.y_test, self.y_pred_cal, 'after_cal', self.directory)

        logging.info(
            f'Detailed analysis of confusion matrix for test set with calibrated classifier. \n'
            f'True positives: {matrix_stats_cal["TP"]} \n'
            f'True negatives: {matrix_stats_cal["TN"]} \n'
            f'False positives: {matrix_stats_cal["FP"]} \n'
            f'False negatives: {matrix_stats_cal["FN"]} \n'
            f'Classification accuracy: {matrix_stats_cal["acc"]} \n'
            f'Classification error: {matrix_stats_cal["err"]} \n'
            f'Sensitivity: {matrix_stats_cal["sensitivity"]} \n'
            f'Specificity: {matrix_stats_cal["specificity"]} \n'
            f'False positive rate: {matrix_stats_cal["FP-rate"]} \n'
            f'False negative rate: {matrix_stats_cal["FN-rate"]} \n'
            f'Precision: {matrix_stats_cal["precision"]} \n'
            f'F1-score: {matrix_stats_cal["F1-score"]} \n')

        logging.info(
            f'Classification report on test set after calibration. \n'
            f'{report_cal} \n')

        print_to_consol(
            'Make a radar plot for performance metrics with calibrated classifier')

        radar_dict_cal = {
            'Classification accuracy': matrix_stats_cal["acc"],
            'Classification error': matrix_stats_cal["err"],
            'Sensitivity': matrix_stats_cal["sensitivity"],
            'Specificity': matrix_stats_cal["specificity"],
            'False positive rate': matrix_stats_cal["FP-rate"],
            'False negative rate': matrix_stats_cal["FN-rate"],
            'Precision': matrix_stats_cal["precision"],
            'F1-score': matrix_stats_cal["F1-score"],
            'ROC AUC': None
        }

        plot_radar_chart(radar_dict_cal, self.directory)

        end = datetime.now()
        duration = end - self.start

        logging.info(f'Training lasted for {duration} \n')
        logging.info(f'Training completed \n')

        print_to_consol('Training completed')
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        # create the k-nearest neighbours classifier
        clf1 = KNeighborsClassifier()

        logging.info(f'Initialised classifier \n')

        # set up the randomized search parameter space
        param_dict = {
            'n_neighbors': randint(2, 10),
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'leaf_size': randint(2, 50)
        }

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        # build and run the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n')

        self.model = rand_search_fitted.best_estimator_

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.model, self.directory, self.bootiter,
            'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        # KNeighborsClassifier has no intrinsic feature importances; as a proxy,
        # cross-validate the model on each feature in isolation
        feature_list = []
        for i in range(len(self.X_train_scaled.columns)):
            X = self.X_train_scaled.iloc[:, i].values.reshape(-1, 1)
            scores = cross_val_score(self.model, X, self.y_train, cv=self.cv)
            feature_list.append(scores.mean())
        feature_importance = sorted(zip(feature_list,
                                        self.X_train_scaled.columns),
                                    reverse=True)

        logging.info(
            f'Feature importances for best classifier {feature_importance} \n')

        print_to_consol('Plotting feature importances for best classifier')

        feature_importances_best_estimator(feature_importance, self.directory)
        logging.info(
            f'Plotting feature importances for best classifier in decreasing order \n')
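    # Hedged alternative (not used above): scikit-learn's permutation importance
    # gives a model-agnostic importance estimate for predictors such as KNN.
    # Sketch only, assuming the fitted self.model and the scaled training data:
    #
    #     from sklearn.inspection import permutation_importance
    #     result = permutation_importance(self.model, self.X_train_scaled,
    #                                     self.y_train, n_repeats=10,
    #                                     random_state=5)
    #     importances = result.importances_mean
    #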
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        # create the Gaussian Naive Bayes classifier
        clf1 = GaussianNB(priors=None)

        logging.info(f'Initialised classifier \n')

        # set up the randomized search parameter space
        param_dict = {'var_smoothing': uniform(0.000000000001, 10.0)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        # build and run the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        best_parameters = rand_search_fitted.best_params_
        best_scores = rand_search_fitted.best_score_
        self.model = rand_search_fitted.best_estimator_

        logging.info(
            f'Running randomised search for best parameters of classifier \n'
            f'Best parameters found: {best_parameters} \n'
            f'Best accuracy scores found: {best_scores} \n'
            f'Prior probability for each class: {self.model.class_prior_} \n'
            f'Mean for each feature for class 0: {self.model.theta_[0]} \n'
            f'Mean for each feature for class 1: {self.model.theta_[1]} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.model, self.directory, self.bootiter,
            'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        # per-class feature means from the fitted Gaussian Naive Bayes model
        class0_feature_ls = self.model.theta_[0]
        class1_feature_ls = self.model.theta_[1]
        df_0 = pd.DataFrame(class0_feature_ls.reshape(
            -1, len(class0_feature_ls)),
                            columns=self.X_train_scaled.columns)
        df_1 = pd.DataFrame(class1_feature_ls.reshape(
            -1, len(class1_feature_ls)),
                            columns=self.X_train_scaled.columns)
        feature_importance_class0 = df_0.to_dict(orient='records')
        feature_importance_class1 = df_1.to_dict(orient='records')

        logging.info(
            f'Feature importances for class 0 for best classifier {feature_importance_class0} \n'
            f'Feature importances for class 1 for best classifier {feature_importance_class1} \n')

        print_to_consol('Plotting feature importances for best classifier')

        gnb_feature_importances(feature_importance_class0, 'class_0',
                                self.directory)
        gnb_feature_importances(feature_importance_class1, 'class_1',
                                self.directory)
        logging.info(
            f'Plotting feature importances for each class for best classifier in decreasing order \n')
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        # create the support vector classifier with RBF kernel
        clf1 = SVC(kernel='rbf',
                   probability=True,
                   random_state=20,
                   class_weight='balanced')

        logging.info(f'Initialised classifier \n')

        # set up the randomized search parameter space
        param_dict = {'C': expon(scale=100), 'gamma': expon(scale=.1)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        # build and run the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        self.model = rand_search_fitted.best_estimator_

        best_parameters = rand_search_fitted.best_params_
        sv = self.model.support_vectors_
        intercept = self.model.intercept_

        logging.info(
            f'Running randomised search for best parameters of a support vector \n'
            f'classifier with RBF kernel; scoring is accuracy \n'
            f'Best parameters found: {best_parameters} \n'
            f'Support vectors: {sv} \n'
            f'Intercept: {intercept} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(
            f'Writing best classifier to disk in {self.directory} \n'
            f'No feature importances available for this type of predictor \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.model, self.directory, self.bootiter,
            'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')
    def randomised_search(self):
        print_to_consol('Running randomized search to find best classifier')

        # create the support vector classifier with linear kernel
        clf1 = SVC(kernel='linear',
                   probability=True,
                   random_state=20,
                   class_weight='balanced')

        logging.info(f'Initialised classifier \n')

        # set up the randomized search parameter space
        param_dict = {'C': expon(scale=100)}

        logging.info(
            f'Following parameters will be explored in randomized search \n'
            f'{param_dict} \n')

        # build and run the randomized search
        rand_search = RandomizedSearchCV(clf1,
                                         param_dict,
                                         random_state=5,
                                         cv=self.cv,
                                         n_iter=self.numc,
                                         scoring='accuracy',
                                         n_jobs=-1)

        rand_search_fitted = rand_search.fit(self.X_train_scaled, self.y_train)

        self.model = rand_search_fitted.best_estimator_

        best_parameters = rand_search_fitted.best_params_
        coef = self.model.coef_
        coef_ravel = coef.ravel()
        intercept = self.model.intercept_

        logging.info(
            f'Running randomised search for best parameters of a support vector \n'
            f'classifier with linear kernel; scoring is accuracy \n'
            f'Best parameters found: {best_parameters} \n'
            f'Coefficients: {coef_ravel} \n'
            f'Intercept: {intercept} \n')

        datestring = datetime.strftime(datetime.now(), '%Y%m%d_%H%M')
        joblib.dump(
            self.model,
            os.path.join(self.directory,
                         'best_predictor_' + datestring + '.pkl'))

        logging.info(f'Writing best classifier to disk in {self.directory} \n')

        print_to_consol(
            'Getting 95% confidence interval for uncalibrated classifier')

        alpha, upper, lower = get_confidence_interval(
            self.X_train_scaled, self.y_train, self.X_test_scaled,
            self.y_test, self.model, self.directory, self.bootiter,
            'uncalibrated')

        logging.info(f'{alpha}% confidence interval {upper}% and {lower}% \n'
                     f'for uncalibrated classifier. \n')

        print_to_consol('Getting feature importances for best classifier')

        # recover feature names for plotting the linear coefficients
        cv = CountVectorizer(lowercase=False)
        cv.fit(self.X_train_scaled.columns)
        feature_names = cv.get_feature_names()

        print_to_consol('Plotting feature importances for best classifier')

        plot_coefficients(coef_ravel, feature_names, self.directory)
        logging.info(f'Plotting feature importances for best classifier \n')
    def detailed_analysis(self):
        print_to_consol(
            'Making a confusion matrix for test set classification outcomes')

        matrix_stats = confusion_matrix_and_stats(self.y_test, self.y_pred,
                                                  self.directory)

        logging.info(f'Detailed analysis of confusion matrix for test set. \n'
                     f'True positives: {matrix_stats["TP"]} \n'
                     f'True negatives: {matrix_stats["TN"]} \n'
                     f'False positives: {matrix_stats["FP"]} \n'
                     f'False negatives: {matrix_stats["FN"]} \n'
                     f'Classification accuracy: {matrix_stats["acc"]} \n'
                     f'Classification error: {matrix_stats["err"]} \n'
                     f'Sensitivity: {matrix_stats["sensitivity"]} \n'
                     f'Specificity: {matrix_stats["specificity"]} \n'
                     f'False positive rate: {matrix_stats["FP-rate"]} \n'
                     f'False negative rate: {matrix_stats["FN-rate"]} \n'
                     f'Precision: {matrix_stats["precision"]} \n'
                     f'F1-score: {matrix_stats["F1-score"]} \n')

        print_to_consol(
            'Plotting precision recall curve for test set class 1 probabilities')

        logging.info(
            f'Plotting precision recall curve for class 1 in test set probabilities. \n')

        plot_precision_recall_vs_threshold(self.y_test,
                                           self.y_pred_proba_ones,
                                           self.directory)

        print_to_consol(
            'Plotting ROC curve and calculating AUC for test set class 1 probabilities')

        logging.info(
            f'Plotting ROC curve for class 1 in test set probabilities. \n')

        self.fpr, self.tpr, self.thresholds = plot_roc_curve(
            self.y_test, self.y_pred_proba_ones, self.directory)

        AUC = round(
            roc_auc_score(self.y_test, self.y_pred_proba_ones) * 100, 2)

        logging.info(
            f'Calculating AUC for ROC curve for class 1 in test set probabilities: {AUC} \n')

        print_to_consol('Make a radar plot for performance metrics')

        radar_dict = {
            'Classification accuracy': matrix_stats["acc"],
            'Classification error': matrix_stats["err"],
            'Sensitivity': matrix_stats["sensitivity"],
            'Specificity': matrix_stats["specificity"],
            'False positive rate': matrix_stats["FP-rate"],
            'False negative rate': matrix_stats["FN-rate"],
            'Precision': matrix_stats["precision"],
            'F1-score': matrix_stats["F1-score"],
            'ROC AUC': AUC
        }

        plot_radar_chart(radar_dict, self.directory)

        print_to_consol(
            'Exploring probability thresholds, sensitivity, specificity for class 1')

        threshold_dict = evaluate_threshold(self.tpr, self.fpr,
                                            self.thresholds)

        logging.info(
            f'Exploring different probability thresholds and sensitivity-specificity trade-offs. \n'
            f'Threshold 0.2: {threshold_dict["0.2"]} \n'
            f'Threshold 0.3: {threshold_dict["0.3"]} \n'
            f'Threshold 0.4: {threshold_dict["0.4"]} \n'
            f'Threshold 0.5: {threshold_dict["0.5"]} \n'
            f'Threshold 0.6: {threshold_dict["0.6"]} \n'
            f'Threshold 0.7: {threshold_dict["0.7"]} \n'
            f'Threshold 0.8: {threshold_dict["0.8"]} \n'
            f'Threshold 0.9: {threshold_dict["0.9"]} \n')

        end = datetime.now()
        duration = end - self.start

        logging.info(
            f'Prediction and analysis lasted for {duration} \n')
        logging.info(f'Prediction and analysis completed \n')

        print_to_consol('Prediction and analysis completed')
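    # Hedged follow-up sketch (not part of the class): once a probability
    # threshold has been chosen from the exploration above, class labels can be
    # derived from the class 1 probabilities with plain numpy; the threshold
    # value 0.4 below is only an illustrative placeholder.
    #
    #     import numpy as np
    #     chosen_threshold = 0.4
    #     y_pred_custom = (self.y_pred_proba_ones >= chosen_threshold).astype(int)
    #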