def generate_plots_for_report():
    """ Generate all plots that are used in the report.

    Notes:
        - Some hyperparameters were tuned on Euler and are used by default. If you want to tune them
          manually/on your computer, use the flag -u (see main.py). The number of iterations used for
          RandomizedSearchCV can be set in classifiers.py
        - Some plots were modified manually for the report, such as removing titles
    """

    # Plot ROC curves of all classifiers
    model_factory.plot_roc_curves(True, True, with_lstm=False)

    _plot_heartrate_change()
    _plot_difficulties()
    _plot_mean_value_of_heartrate_at_crash()
    _plot_feature_correlation_matrix(reduced_features=False)
    _plot_heartrate_and_events()

    X, y = f_factory.get_feature_matrix_and_label(
        verbose=False,
        use_cached_feature_matrix=True,
        save_as_pickle_file=True,
        reduced_features=False,
        use_boxcox=False
    )

    # Plot an example of a Decision Tree by taking the first tree of the tuned Random Forest
    decision_tree_clf = classifiers.get_cclassifier_with_name('Random Forest', X, y).tuned_clf
    model_factory.get_performance(decision_tree_clf, 'Random Forest', X, y, None,
                                  verbose=False, create_curves=False)

    # Plot ROC curve of the Nearest Neighbor classifier (J-Index in the report was added manually...)
    print('Plotting ROC curve of Nearest Neighbor classifier...')
    nearest_neighbor_clf = classifiers.get_cclassifier_with_name('Nearest Neighbor', X, y).tuned_clf
    model_factory.get_performance(nearest_neighbor_clf, 'Nearest Neighbor', X, y, None,
                                  verbose=False, create_curves=True)

    # The following plots take a little longer, so only uncomment them if you really want them
    '''
def _test_clf_with_timedelta_only():
    """ (Debugging purposes only). Calculates the timedelta feature without using any other features.
    Since this also gives a good score, the timedelta feature really is a good predictor!
    """
    print("\n################# Testing classifier using timedelta feature only #################\n")

    df_list = random.sample(sd.df_list, len(sd.df_list))

    # Compute y_true for each logfile
    y_list = []
    for df in df_list:
        y_true = []
        for _, row in df.iterrows():
            if (row['Logtype'] == 'EVENT_CRASH') | (row['Logtype'] == 'EVENT_OBSTACLE'):
                y_true.append(1 if row['Logtype'] == 'EVENT_CRASH' else 0)
        y_list.append(y_true)

    # Compute the feature matrix for each logfile
    X_matrices = []
    for df in df_list:
        X = []
        for _, row in df.iterrows():
            if (row['Logtype'] == 'EVENT_CRASH') | (row['Logtype'] == 'EVENT_OBSTACLE'):
                last_obstacles = df[(df['Time'] < row['Time']) &
                                    ((df['Logtype'] == 'EVENT_OBSTACLE') | (df['Logtype'] == 'EVENT_CRASH'))]
                if last_obstacles.empty:
                    X.append(2)
                else:
                    X.append(row['Time'] - last_obstacles.iloc[-1]['Time'])
        X_matrices.append(X)

    x_train = np.hstack(X_matrices).reshape(-1, 1)  # Reshape because there is only one feature
    y_train = np.hstack(y_list).reshape(-1, 1)

    clf = classifiers.get_cclassifier_with_name('Decision Tree', x_train, y_train).clf
    score_dict = cross_validate(clf, x_train, y_train, scoring='roc_auc', cv=10)
    print('Mean roc_auc score with cross_validate: ' + str(np.mean(score_dict['test_score'])))
'''
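# A self-contained sketch of the "time since the previous obstacle/crash" feature computed in the
# (commented-out) test above, using a vectorized pandas diff on a tiny made-up logfile instead of the
# row-wise loop. It assumes, as the loop does, that the logfile is sorted by 'Time'; the value 2 is the
# same default the test uses when there is no previous event.
def _example_timedelta_feature():
    import pandas as pd

    log = pd.DataFrame({
        'Time':    [1.0, 3.5, 4.0, 7.2, 9.1],
        'Logtype': ['EVENT_OBSTACLE', 'EVENT_CRASH', 'EVENT_OBSTACLE', 'EVENT_CRASH', 'EVENT_OBSTACLE'],
    })
    events = log[log['Logtype'].isin(['EVENT_OBSTACLE', 'EVENT_CRASH'])]
    timedelta = events['Time'].diff().fillna(2)                 # time since the previous obstacle/crash
    y_true = (events['Logtype'] == 'EVENT_CRASH').astype(int)   # 1 = crash, 0 = obstacle passed
    print(pd.DataFrame({'timedelta': timedelta, 'y_true': y_true}))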
def get_tuned_clf_and_tuned_hyperparameters(X, y, clf_name='svm', verbose=True, pre_set=True):
    """ Optimizes hyperparameters with cross-validation using RandomizedSearchCV and returns the optimized
    classifier together with the tuned hyperparameters.

    :param X: Feature matrix
    :param y: labels
    :param clf_name: Name of the classifier as given in classifiers.py
    :param verbose: Whether scores of the top hyperparameter configurations should be printed out
    :param pre_set: Some classifiers have pre-tuned parameters (tuned on Euler). Take those instead of tuning

    :return: optimized classifier, dictionary of tuned_params
    """
    c_classifier = classifiers.get_cclassifier_with_name(clf_name, X, y)

    if clf_name == 'Naive Bayes':  # Naive Bayes doesn't have any hyperparameters to tune
        if synthesized_data.synthesized_data_enabled:
            X_n, y_n = f_factory.get_feature_matrix_and_label(False, False, True, True, False)
        else:
            X_n, y_n = f_factory.get_feature_matrix_and_label(True, True, True, True, False)

        c_classifier.clf.fit(X_n, y_n)
        return c_classifier.clf, []
    else:
        if pre_set and hasattr(c_classifier, 'tuned_clf'):
            print('Hyperparameters for ' + clf_name + ' were already tuned, taking those pre-set parameters')
            return c_classifier.tuned_clf, model_factory.get_tuned_params_dict(
                c_classifier.tuned_clf, list(c_classifier.tuned_params.keys()))
        else:
            print('Doing RandomizedSearchCV with n_iter=' + str(c_classifier.num_iter) + ' for ' + clf_name + '...')
            start = time.time()

            scaler = MinMaxScaler(feature_range=(0, 1))
            corr = FindCorrelation(threshold=0.9)
            p = make_pipeline(scaler, corr, c_classifier.clf)

            params = dict((c_classifier.estimator_name + '__' + key, value)
                          for (key, value) in c_classifier.tuned_params.items())

            clf = RandomizedSearchCV(p, params, cv=3, scoring='roc_auc', n_iter=c_classifier.num_iter)
            clf.fit(X, y)

            end = time.time()
            print("Time elapsed for hyperparameter tuning: " + str(end - start))

            if verbose:
                _report(clf.cv_results_)

            clf = clf.best_estimator_.steps[2][1]  # Unwrap the estimator from the pipeline object

            return clf, model_factory.get_tuned_params_dict(clf, list(c_classifier.tuned_params.keys()))
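# A minimal usage sketch, assuming the module-level imports used above (f_factory, classifiers, etc.)
# are available and 'SVM' is one of the names defined in classifiers.py. Setting pre_set=False forces
# a fresh RandomizedSearchCV run instead of reusing the parameters that were pre-tuned on Euler.
def _example_tune_single_classifier():
    X, y = f_factory.get_feature_matrix_and_label(
        verbose=False, use_cached_feature_matrix=True, save_as_pickle_file=True,
        reduced_features=False, use_boxcox=False)
    clf, tuned_params = get_tuned_clf_and_tuned_hyperparameters(X, y, clf_name='SVM',
                                                                verbose=True, pre_set=False)
    print(tuned_params)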
def main(args):
    """ Call '$ python main.py -h' to see how to use this module

    :param args: ArgumentParser
    """
    start = time.time()

    assert (not (args.use_synthesized_data and args.leave_one_group_out)), \
        'Can\'t do leave_one_group_out with synthesized data'

    if args.use_synthesized_data:
        print('Creating synthesized data...')
        synthesized_data.init_with_testdata_events_const_hr_const()
        X, y = f_factory.get_feature_matrix_and_label(
            verbose=True,
            use_cached_feature_matrix=False,
            save_as_pickle_file=False,
            reduced_features=False)
    else:
        setup_dataframes.setup(
            fewer_data=args.debugging,  # Specify if we want fewer data (for debugging purposes...)
            normalize_heartrate=(not args.do_not_normalize_heartrate),
            remove_tutorials=False)
        X, y = f_factory.get_feature_matrix_and_label(
            verbose=True,
            use_cached_feature_matrix=True,
            save_as_pickle_file=True,
            reduced_features=False,
            use_boxcox=False)

    if args.print_keynumbers_logfiles:
        print("\n################# Printing keynumbers #################\n")
        setup_dataframes.print_keynumbers_logfiles()

    if args.test_windows:
        print("\n################# Window optimization #################\n")
        window_optimization.performance_score_for_windows(
            args.test_windows[0],
            args.test_windows[1],
            args.test_windows[2],
            verbose=True,
            write_to_file=True,
        )

    if args.performance_without_tuning or args.performance_with_tuning:
        pre_set = not args.do_not_use_pre_tuned_hyperparameters

        if args.performance_with_tuning:
            print("\n################# Calculating performance with hyperparameter tuning #################\n")
        else:
            print("\n################# Calculating performance without hyperparameter tuning #################\n")

        # Note: The number of iterations in RandomizedSearchCV can be set in classifiers.py
        if args.performance_without_tuning == 'all' or args.performance_with_tuning == 'all':
            model_factory.calculate_performance_of_classifiers(
                X, y, tune_hyperparameters=args.performance_with_tuning, reduced_clfs=True, pre_set=pre_set)
        else:
            X_old = X
            y_old = y
            if (args.performance_with_tuning == 'Naive Bayes') or (args.performance_without_tuning == 'Naive Bayes'):
                X, y = f_factory.get_feature_matrix_and_label(
                    verbose=False,
                    use_cached_feature_matrix=True,
                    save_as_pickle_file=True,
                    use_boxcox=True,
                    reduced_features=False)

            if args.performance_with_tuning:
                clf, tuned_params = hyperparameter_optimization.get_tuned_clf_and_tuned_hyperparameters(
                    X, y, clf_name=args.performance_with_tuning, pre_set=pre_set,
                )
                _, _, _, _, _, _, _, _, _, _, _, report = model_factory.get_performance(
                    clf, args.performance_with_tuning, X, y, tuned_params,
                    verbose=True, do_write_to_file=False)
            else:
                model = classifiers.get_cclassifier_with_name(args.performance_without_tuning, X, y)
                _, _, _, _, _, _, _, _, _, _, _, report = model_factory.get_performance(
                    model.clf, args.performance_without_tuning, X, y,
                    verbose=True, do_write_to_file=False)

            X = X_old
            y = y_old

            print(report)

    if args.leave_one_group_out:
        print("\n################# Leave one out #################\n")
        leave_one_group_out_cv.clf_performance_with_user_left_out_vs_normal(
            X, y, True, reduced_features=False, reduced_classifiers=True)

    if args.evaluate_lstm:
        print("\n################# Get trained LSTM #################\n")
        LSTM.get_performance_of_lstm_classifier(X, y, n_epochs=args.evaluate_lstm[0])
        # LSTM.get_finalscore(X, y, n_epochs=args.evaluate_lstm[0])

    if args.generate_plots_about_features:
        print("\n################# Generate plots about features #################\n")
        plot_features(X, y)

    if args.generate_plots_about_logfiles:
        print("\n################# Generate plots about logfiles #################\n")
        plot_logfiles(args)

    if args.generate_plots_for_report:
        print("\n################# Generate plots for report #################\n")
        plots_report.generate_plots_for_report()

    end = time.time()
    print("Time elapsed: " + str(end - start))
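# A sketch of driving main() programmatically. The actual command-line flags are defined by the
# ArgumentParser elsewhere (not shown here); this Namespace only mirrors the attributes that main()
# reads above, with values chosen for a quick debugging run.
def _example_run_main():
    import argparse

    args = argparse.Namespace(
        use_synthesized_data=False,
        leave_one_group_out=False,
        debugging=True,                          # fewer data, as passed to setup_dataframes.setup(fewer_data=...)
        do_not_normalize_heartrate=False,
        print_keynumbers_logfiles=True,
        test_windows=None,
        performance_with_tuning=None,
        performance_without_tuning='Random Forest',
        do_not_use_pre_tuned_hyperparameters=False,
        evaluate_lstm=None,
        generate_plots_about_features=False,
        generate_plots_about_logfiles=False,
        generate_plots_for_report=False,
    )
    main(args)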
def test_all_windows():
    """ Keeps one window fixed and changes the other two. Calculates the roc_auc of the classifier given by
    model_name ('Nearest Neighbor' per default, with pre-tuned parameters) for each window combination and
    plots the scores as a heatmap.
    """
    print("\n################# Testing all window sizes #################\n")

    const_window = 'cw'
    const_w = 10
    list_1 = [5, 10, 20, 30, 50, 60]
    list_2 = list_1[::-1]

    if const_window == 'hw':
        name1 = 'Crash window (s)'
        name2 = 'Gradient window (s)'
        filename = 'windows_const_hw.pdf'
    elif const_window == 'cw':
        name1 = 'Default window (s)'
        name2 = 'Gradient window (s)'
        filename = 'windows_const_cw.pdf'
    else:
        name1 = 'Crash window'
        name2 = 'Default window'
        filename = 'windows_const_gradient_w.pdf'

    mean_scores = np.zeros((len(list_1), len(list_2)))
    model_name = 'Nearest Neighbor'

    for idx_w1, w1 in enumerate(list_1):
        for idx_w2, w2 in enumerate(list_2):
            # Assign the two varying windows depending on which window is kept constant
            if const_window == 'hw':
                h_window, c_window, gradient_window = const_w, w1, w2
            elif const_window == 'cw':
                h_window, c_window, gradient_window = w1, const_w, w2
            else:
                h_window, c_window, gradient_window = w1, w2, const_w

            X, y = f_factory.get_feature_matrix_and_label(
                verbose=True,
                use_cached_feature_matrix=True,
                save_as_pickle_file=True,
                h_window=h_window,
                c_window=c_window,
                gradient_window=gradient_window,
                reduced_features=False)

            model = classifiers.get_cclassifier_with_name(model_name, X, y).tuned_clf
            roc_auc_mean, roc_auc_std, _, _, _, _, _, _, _, _, _, _ = model_factory.get_performance(
                model, model_name, X, y, tuned_params_keys=None, verbose=False, create_curves=False)
            mean_scores[idx_w1][idx_w2] = roc_auc_mean

    mean_scores = np.fliplr(np.flipud(mean_scores))  # Flip to plot it correctly

    # Plot elements
    plt.subplot()
    plt.imshow(mean_scores, cmap='RdYlGn')
    plt.title('Average classifier performance when using constant ' + const_window)
    ax = plt.gca()
    ax.set_xticks(np.arange(0, len(list_1), 1))
    ax.set_yticks(np.arange(0, len(list_2), 1))
    ax.set_xticklabels(list_1)
    ax.set_yticklabels(list_2)
    ax.set_ylabel(name1)
    ax.set_xlabel(name2)
    plt.colorbar()

    plots_helpers.save_plot(plt, 'Performance/Windows/', filename)
def plot_roc_curves(hyperparameter_tuning=False, pre_set=True, with_lstm=False):
    """ Plots the roc_curves of all classifiers in one single plot

    :param hyperparameter_tuning: Do hyperparameter tuning
    :param pre_set: Some classifiers have pre-tuned parameters (tuned on Euler). Take those instead of tuning
    :param with_lstm: Also include the ROC of the LSTM network (takes a little time...)

    Folder: Report/
    Plot name: roc_curves.pdf
    """
    X, y = f_factory.get_feature_matrix_and_label(
        verbose=False,
        use_cached_feature_matrix=True,
        save_as_pickle_file=True,
        reduced_features=False,
        use_boxcox=False
    )

    clf_names = ['SVM', 'Nearest Neighbor', 'Random Forest', 'Naive Bayes']
    if pre_set:
        clf_list = [classifiers.get_cclassifier_with_name(name, X, y).tuned_clf for name in clf_names]
    else:
        clf_list = [classifiers.get_cclassifier_with_name(name, X, y).clf for name in clf_names]

    tprs = []
    fprs = []
    roc_aucs = []

    for idx, classifier in enumerate(clf_list):
        if hyperparameter_tuning:
            classifier, _ = hyperparameter_optimization.get_tuned_clf_and_tuned_hyperparameters(
                X, y, clf_name=clf_names[idx], verbose=False, pre_set=True
            )

        # clf = CalibratedClassifierCV(classifier)
        clf = classifier

        kf = KFold(n_splits=10)
        predicted_probas_list = []
        y = np.array(y)

        for train_index, test_index in kf.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            # Fit and transform the scaler on the training set, then transform the test set too
            scaler = MinMaxScaler(feature_range=(0, 1))
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            corr = FindCorrelation(threshold=0.9)
            X_train = corr.fit(X_train).transform(X_train)
            X_test = corr.transform(X_test)

            clf.fit(X_train, y_train)
            predicted_probas = clf.predict_proba(X_test)
            predicted_probas_list.append(predicted_probas[:, 1])

        # KFold without shuffling yields the test folds in the original index order, so the
        # concatenated per-fold probabilities line up with y
        fpr, tpr, _ = roc_curve(y, list(itertools.chain.from_iterable(predicted_probas_list)))
        roc_auc = auc(fpr, tpr)
        fprs.append(fpr)
        tprs.append(tpr)
        roc_aucs.append(roc_auc)

    # Also add LSTM scores:
    if with_lstm:
        clf_names.append("LSTM")
        fpr, tpr, roc_auc = LSTM.create_roc_curve(X, y, 130)
        fprs.append(fpr)
        tprs.append(tpr)
        roc_aucs.append(roc_auc)

    plt.figure()
    for idx, name in enumerate(clf_names):
        plt.plot(fprs[idx], tprs[idx], label=name + ' (AUC = %0.2f)' % roc_aucs[idx])

    plt.title('Roc curves')
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], c='gray', ls='--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    plots_helpers.save_plot(plt, 'Report/', 'roc_curves.pdf')
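# A self-contained sketch of the cross-validated ROC pattern used above, run on synthetic data so it
# works without the project's feature pipeline (the FindCorrelation step is left out here). As noted
# above, stitching the per-fold probabilities back together only lines up with y because KFold without
# shuffling yields the test folds in the original index order.
def _example_cross_validated_roc():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import auc, roc_curve
    from sklearn.model_selection import KFold
    from sklearn.preprocessing import MinMaxScaler

    X, y = make_classification(n_samples=500, n_features=10, random_state=0)

    predicted_probas_list = []
    for train_index, test_index in KFold(n_splits=10).split(X):
        scaler = MinMaxScaler(feature_range=(0, 1))
        X_train = scaler.fit_transform(X[train_index])   # fit the scaler on the training fold only
        X_test = scaler.transform(X[test_index])
        clf = RandomForestClassifier(random_state=0).fit(X_train, y[train_index])
        predicted_probas_list.append(clf.predict_proba(X_test)[:, 1])

    fpr, tpr, _ = roc_curve(y, np.concatenate(predicted_probas_list))
    print('AUC = %0.2f' % auc(fpr, tpr))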
def calculate_performance_of_classifiers(X, y, tune_hyperparameters=False, reduced_clfs=True, create_barchart=True,
                                         create_curves=True, do_write_to_file=True, pre_set=False):
    """Computes the performance (roc_auc, recall, specificity, precision, confusion matrix) of either all or only the
    reduced set of classifiers, optionally writes it into a file and plots the roc_auc scores in a barchart.

    :param X: Feature matrix
    :param y: labels
    :param tune_hyperparameters: Whether or not hyperparameters should be tuned
    :param reduced_clfs: All classifiers, or only SVM, Nearest Neighbor, Random Forest and Naive Bayes
    :param create_barchart: Create a barchart consisting of the roc_auc scores
    :param create_curves: Create roc_curves and precision_recall curve
    :param do_write_to_file: Write summary of performance into a file (optional)
    :param pre_set: Some classifiers have pre-tuned parameters (tuned on Euler). Take those instead of tuning

    :return: list of roc_aucs, list of roc_auc_stds (one score for each classifier) and formatted string of scores
    """
    if reduced_clfs:
        clf_names = classifiers.reduced_names
    else:
        clf_names = classifiers.names

    clf_list = [classifiers.get_cclassifier_with_name(name, X, y).clf for name in clf_names]

    if tune_hyperparameters or pre_set:
        clf_list = [hyperparameter_optimization.get_tuned_clf_and_tuned_hyperparameters(
                        X, y, name, verbose=False, pre_set=pre_set)[0]
                    for name in clf_names]

    scores_mean = []
    scores_std = []
    names = []
    tuned_params = []
    conf_mats = []

    windows = str(f_factory.hw) + '_' + str(f_factory.cw) + '_' + str(f_factory.gradient_w)
    filename = 'clf_performances_with_hp_tuning_' + windows if tune_hyperparameters \
        else 'clf_performances_without_hp_tuning_' + windows

    for idx, clf in enumerate(clf_list):
        tuned_parameters = classifiers.get_cclassifier_with_name(clf_names[idx], X, y).tuned_params
        clf_name = clf_names[idx]
        names.append(clf_name)

        if clf_name == 'Naive Bayes':  # Naive Bayes doesn't have any hyperparameters to tune
            X_n, y_n = f_factory.get_feature_matrix_and_label(True, True, True, True, False)
            roc_auc, roc_auc_std, recall, recall_std, specificity, specificity_std, precision, precision_std, \
                f1, f1_std, conf_mat, _ = get_performance(clf, clf_name, X_n, y_n, create_curves=create_curves)
        else:
            roc_auc, roc_auc_std, recall, recall_std, specificity, specificity_std, precision, precision_std, f1, \
                f1_std, conf_mat, _ = get_performance(clf, clf_name, X, y, tuned_parameters,
                                                      create_curves=create_curves)

        scores_mean.append([roc_auc, recall, specificity, precision, f1])
        scores_std.append([roc_auc_std, recall_std, specificity_std, precision_std, f1_std])
        tuned_params.append(get_tuned_params_dict(clf, tuned_parameters))
        conf_mats.append(conf_mat)

    if create_barchart:
        title = 'Scores by classifier with hyperparameter tuning' if tune_hyperparameters \
            else 'Scores by classifier without hyperparameter tuning'
        _plot_barchart_scores(names, [s[0] for s in scores_mean], [s[0] for s in scores_std], title,
                              filename + '.pdf')

    s = ''

    roc_scores = [s[0] for s in scores_mean]
    roc_scores_std = [s[0] for s in scores_std]
    recall_scores = [s[1] for s in scores_mean]
    recall_scores_std = [s[1] for s in scores_std]
    specificity_scores = [s[2] for s in scores_mean]
    specificity_scores_std = [s[2] for s in scores_std]
    precision_scores = [s[3] for s in scores_mean]
    precision_scores_std = [s[3] for s in scores_std]
    f1_scores = [s[4] for s in scores_mean]
    f1_scores_std = [s[4] for s in scores_std]

    for i, name in enumerate(names):
        s += create_string_from_scores(name, roc_scores[i], roc_scores_std[i], recall_scores[i],
                                       recall_scores_std[i], specificity_scores[i], specificity_scores_std[i],
                                       precision_scores[i], precision_scores_std[i], f1_scores[i], f1_scores_std[i],
                                       conf_mats[i], tuned_params[i])

    if do_write_to_file:
        write_to_file(s, 'Performance/', filename + '.txt', 'w+')

    return roc_scores, roc_scores_std, s
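# A usage sketch, assuming the module-level import of f_factory used above is available: compare the
# reduced classifier set with the pre-tuned (Euler) hyperparameters and write the summary to disk.
def _example_classifier_comparison():
    X, y = f_factory.get_feature_matrix_and_label(
        verbose=False, use_cached_feature_matrix=True, save_as_pickle_file=True,
        reduced_features=False, use_boxcox=False)
    roc_aucs, roc_auc_stds, summary = calculate_performance_of_classifiers(
        X, y, tune_hyperparameters=False, reduced_clfs=True, create_barchart=True,
        do_write_to_file=True, pre_set=True)
    print(summary)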
def clf_performance_with_user_left_out_vs_normal(X, y, plot_auc_score_per_user=True, reduced_features=False,
                                                 reduced_classifiers=True, pre_set=True):
    """ Plots a barchart with the mean roc_auc score of each classifier in two scenarios:

    1. Normal cross-validation to get the roc_auc (part of a user's logfile can thus end up in the training set
       AND in the test set. This can influence the performance on the test set, as the model has already seen
       part of this user's data/behavior in the training set)

    2. For the training data, use all but one user, then predict the score on the one user that was NOT used
       in the training phase!

    :param X: Feature matrix
    :param y: labels
    :param plot_auc_score_per_user: Whether or not we should create a plot for each user left out with the
                                    auc_score of each classifier when using LeaveOneGroupOut cross validation
    :param reduced_features: Whether we should use all features or do feature selection first
    :param reduced_classifiers: Only use reduced classifiers (see classifiers.py)
    :param pre_set: Some classifiers have pre-tuned parameters (tuned on Euler). Take those instead of tuning

    Folder: Report/
    Plot name: clf_performance_with_user_left_out_vs_normal.pdf
    """
    if reduced_classifiers:
        clf_names = classifiers.reduced_names
    else:
        clf_names = classifiers.names

    if pre_set:
        clf_list = [classifiers.get_cclassifier_with_name(name, X, y).tuned_clf for name in clf_names]
    else:
        clf_list = [classifiers.get_cclassifier_with_name(name, X, y).clf for name in clf_names]

    # Get scores for scenario 1 (normal crossvalidation)
    print('\n***** Scenario 1 (normal crossvalidation) *****\n')
    auc_scores_scenario_1, auc_stds_scenario_1, s = model_factory.calculate_performance_of_classifiers(
        X, y, tune_hyperparameters=True, reduced_clfs=reduced_classifiers, create_curves=False,
        do_write_to_file=False, pre_set=True)

    # Get scores for scenario 2 (leave one user out in the training phase)
    print('\n***** Scenario 2 (Leave one user out in training phase) ***** \n')

    auc_scores_scenario_2 = []
    auc_stds_scenario_2 = []
    for name, classifier in zip(clf_names, clf_list):
        print('Calculating performance of %s with doing LeaveOneGroupOut ...' % name)

        # If the Naive Bayes classifier is used, apply a Boxcox transformation since its features must be
        # gaussian distributed
        if name == 'Naive Bayes':
            feature_selection = 'selected' if reduced_features else 'all'
            X_nb, y_nb = f_factory.get_feature_matrix_and_label(
                verbose=False,
                use_cached_feature_matrix=feature_selection,
                save_as_pickle_file=True,
                use_boxcox=True,
                reduced_features=False)
            classifier.fit(X_nb, y_nb)
            auc_mean, auc_std = _apply_cv_per_user_model(classifier, name, X_nb, y_nb, plot_auc_score_per_user)
        else:
            classifier.fit(X, y)
            auc_mean, auc_std = _apply_cv_per_user_model(classifier, name, X, y, plot_auc_score_per_user)

        auc_scores_scenario_2.append(auc_mean)
        auc_stds_scenario_2.append(auc_std)

    _plot_scores_normal_cv_vs_leaveone_group_out_cv(clf_names, auc_scores_scenario_1, auc_stds_scenario_1,
                                                    auc_scores_scenario_2, auc_stds_scenario_2)
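# The helper _apply_cv_per_user_model() is defined elsewhere in this module; the self-contained sketch
# below illustrates the leave-one-user-out idea behind scenario 2 with sklearn's LeaveOneGroupOut on
# synthetic data: each "user" (group) is held out entirely, so no part of that user's data can leak
# into the training set.
def _example_leave_one_user_out():
    import numpy as np
    from sklearn.datasets import make_classification
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import LeaveOneGroupOut

    X, y = make_classification(n_samples=600, n_features=10, random_state=0)
    groups = np.repeat(np.arange(6), 100)      # six synthetic "users" with 100 samples each

    aucs = []
    for train_index, test_index in LeaveOneGroupOut().split(X, y, groups):
        clf = RandomForestClassifier(random_state=0).fit(X[train_index], y[train_index])
        aucs.append(roc_auc_score(y[test_index], clf.predict_proba(X[test_index])[:, 1]))

    print('Mean roc_auc with one user left out: %0.2f (+/- %0.2f)' % (np.mean(aucs), np.std(aucs)))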