def classify_separated_test_directory():
    """Classify using disjoint training and test record directories.

    Training samples are parsed from config.get_record_dir() and test
    samples from SEPARATED_TEST_RECORD_DIR.  The test records are appended
    to the training records so a single feature matrix can be built, and
    explicit train/test index lists recover the split for classification.
    """
    print("\nDo classification with different training directory and test directory")
    print("\nTraining directory: " + config.get_record_dir())
    print("\nTest directory: " + SEPARATED_TEST_RECORD_DIR)

    train_contents, train_labels = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=True)
    test_contents, test_labels = input_parser.parse_input_files(
        SEPARATED_TEST_RECORD_DIR, combine_sc_vectors=True)

    if len(train_contents) != len(test_contents):
        raise ValueError(
            "Different number of input files in training directory and test directory - must be equal"
        )

    num_train = len(train_labels)
    train_index = list(range(num_train))
    test_index = list(range(num_train, num_train + len(test_labels)))

    # Merge each test file's records into its matching training file content
    for train_fc, test_fc in zip(train_contents, test_contents):
        train_fc.records.extend(test_fc.records)
    train_labels.extend(test_labels)

    X = ft.extract_preconfigured_features(train_contents)
    Y = pd.Series(train_labels)
    Y_test, predictions, accuracy = app_classifier.do_classification(
        X, Y, train_index, test_index)
    print(classification_report(Y_test, predictions))
def get_used_side_channels(file_dict):
    """Return the side-channel file contents to be used for classification.

    If config.USE_TARGETED_SIDECHANNELS is set, only the side channels
    listed in config.TARGETED_SIDECHANNELS are selected; otherwise every
    entry of file_dict is used.
    """
    if config.USE_TARGETED_SIDECHANNELS:
        print("\nUse targeted side channels in target directory " +
              config.get_record_dir() + ": ", config.TARGETED_SIDECHANNELS)
        return [
            get_file(file_dict, side_channel)
            for side_channel in config.TARGETED_SIDECHANNELS
        ]

    print("\nUse " + str(len(file_dict)) +
          " side channels in target directory " + config.get_record_dir())
    selected = []
    for name, content in file_dict.items():
        print("Use side channel " + name)
        selected.append(content)
    return selected
def main():
    """Run combined k-fold cross-validation over all input files, timed."""
    timing.start_measurement()
    print("Do combined classification using all input files")
    contents, label_values = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=True)
    feature_matrix = ft.extract_preconfigured_features(contents)
    targets = pd.Series(label_values)
    app_classifier.do_kfold_cross_validation(feature_matrix, targets)
    timing.stop_measurement()
def explorative_classification():
    """Cross-validate each input file on its own and rank files by accuracy."""
    file_contents, label_list = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=False)

    results = []
    for position, file_content in enumerate(file_contents):
        print("\nEvaluate ", file_content.file_name)
        targets = pd.Series(label_list[position])
        total_accuracy = app_classifier.do_kfold_cross_validation(
            [file_content], targets, verbose=False)
        results.append(ClassificationResult(total_accuracy,
                                            file_content.file_name))

    # Best-performing files first
    results.sort(key=lambda result: result.accuracy, reverse=True)
    print("\nSummary for files in " + config.get_record_dir() + ":\n")
    for entry in results:
        print(entry)
def main():
    """Run combined k-fold cross-validation and plot accuracy summaries."""
    timing.start_measurement()
    print("Do combined classification using all input files")
    contents, label_values = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=True)
    feature_matrix = ft.extract_preconfigured_features(contents)
    targets = pd.Series(label_values)
    (_, first_acc, second_acc, third_acc,
     single_accs) = app_classifier.do_kfold_cross_validation(
         feature_matrix, targets)
    # Plot 1st/2nd/3rd-guess accuracies, then the per-side-channel accuracies
    plt.plot([first_acc, second_acc, third_acc])
    plt.show()
    plt.plot(single_accs)
    plt.show()
    timing.stop_measurement()
def explorative_classification():
    """Cross-validate every input file individually and report rankings.

    For each parsed side-channel file a separate k-fold cross-validation is
    run.  Total, 1st-, 2nd- and 3rd-guess accuracies are collected, printed
    as sorted summaries, and the per-fold single accuracies are written to
    one result file per side channel under the configured results directory.
    """
    file_contents, label_list = input_parser.parse_input_files(
        config.get_record_dir(), combine_sc_vectors=False)

    results = []
    results_first = []
    results_second = []
    results_third = []
    single_results = []

    for idx, fc in enumerate(file_contents):
        labels = label_list[idx]
        print("\nEvaluate ", fc.file_name)
        X = [fc]
        Y = pd.Series(labels)
        # fc.file_name[:-4] drops the (presumably 3-char) file extension for
        # the per-file report name -- TODO confirm extension length
        (total_accuracy, total_first_acc, total_second_acc, total_third_acc,
         total_single_accuracies) = app_classifier.do_kfold_cross_validation(
             X, Y, verbose=True, file_name=fc.file_name[:-4])
        results.append(
            ClassificationResult(round_float(total_accuracy), fc.file_name))
        results_first.append(
            ClassificationResult(round_float(total_first_acc), fc.file_name))
        results_second.append(
            ClassificationResult(round_float(total_second_acc), fc.file_name))
        results_third.append(
            ClassificationResult(round_float(total_third_acc), fc.file_name))
        single_results.append([
            ClassificationResult(round_float(acc), fc.file_name)
            for acc in total_single_accuracies
        ])

    # Best-performing files first, in every ranking
    for result_list in (results, results_first, results_second, results_third):
        result_list.sort(key=lambda result: result.accuracy, reverse=True)

    for prefix, result_list in (
            ("\nSummary for files in ", results),
            ("\nSummary of first for files in ", results_first),
            ("\nSummary of second for files in ", results_second),
            ("\nSummary of third for files in ", results_third)):
        print(prefix + config.get_record_dir() + ":\n")
        for r in result_list:
            print(r)

    results_dir = (config.RECORD_BASE_DIR + config.get_record_dir() +
                   config.RESULTS_DIR)
    # NOTE(review): zip() over a single iterable yields 1-tuples, so each
    # `single_result` wraps one per-file result list; `zip(*single_results)`
    # may have been intended.  The original behavior (one output file per
    # side channel) is preserved here -- confirm intent.
    for single_result in zip(single_results):
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        for per_file_results in single_result:
            # with-statement guarantees the handle is closed even if a
            # write fails (original used bare open()/close())
            with open(results_dir + per_file_results[0].file_name,
                      "w") as out_file:
                for rank, result in enumerate(per_file_results):
                    out_file.write(
                        str(rank + 1) + ", " + str(result.accuracy) + "\n")
def do_kfold_cross_validation(X, Y, verbose=True, file_name=""):
    """Run stratified k-fold cross-validation and average the fold metrics.

    Args:
        X: list of file-content objects; X[0].records defines the sample
           axis used for the fold split.
        Y: pd.Series of class labels aligned with those records.
        verbose: if True, print per-fold details and write the
           classification report and confusion matrix to the results dir.
        file_name: base name (without extension) for the result files.

    Returns:
        (total_accuracy, total_first_acc, total_second_acc, total_third_acc,
        total_single_accuracies) -- each metric averaged over all folds.
    """
    folds = config.FOLDS
    printv("\nSelecting rows for " + str(folds) + "-fold validation", verbose)
    kf = StratifiedKFold(n_splits=folds, shuffle=True)

    # Confusion matrix indexed by class label on both axes
    unique_labels = Y.unique()
    cnf_mat = pd.DataFrame(np.zeros((len(unique_labels), len(unique_labels))),
                           columns=unique_labels)
    cnf_mat.set_index(keys=unique_labels, inplace=True)

    Y_test_all_folds = []
    predictions_all_folds = []
    summed_accuracy = 0
    summed_first_acc = 0
    summed_second_acc = 0
    summed_third_acc = 0
    summed_single_accuracies = []
    fold_cnt = 1

    # The first file's records drive the split; distance matrices are
    # precomputed once and reused across folds
    split_var = X[0].records
    dist_matrices = precomputed_knn_selector.init_dist_matrices(X)

    for train_indices, test_indices in kf.split(split_var, Y):
        printv("\nFold: " + str(fold_cnt), verbose)
        Y_test, predictions, accuracy, acc_first, acc_second, acc_third, single_accuracies = do_classification(
            X, Y, train_indices, test_indices, dist_matrices)
        if verbose:
            for idx, pred in enumerate(predictions):
                # .loc replaces DataFrame.ix, which was removed in pandas 1.0
                cnf_mat.loc[Y_test.iloc[idx], pred] += 1
        printv("Accuracy:" + str(accuracy), verbose)
        summed_accuracy += accuracy
        summed_first_acc += acc_first
        summed_second_acc += acc_second
        summed_third_acc += acc_third
        # Accumulate per-side-channel accuracies position-wise; the list
        # grows on the first fold and is summed into afterwards
        for idx, single_accuracy in enumerate(single_accuracies):
            if idx >= len(summed_single_accuracies):
                summed_single_accuracies.append(single_accuracy)
            else:
                summed_single_accuracies[idx] += single_accuracy
        fold_cnt += 1
        Y_test_all_folds.extend(Y_test.values.tolist())
        predictions_all_folds.extend(predictions.values.tolist())

    total_accuracy = summed_accuracy / folds
    total_first_acc = summed_first_acc / folds
    total_second_acc = summed_second_acc / folds
    total_third_acc = summed_third_acc / folds
    total_single_accuracies = [s / folds for s in summed_single_accuracies]

    results_dir = (config.RECORD_BASE_DIR + config.get_record_dir() +
                   config.RESULTS_DIR)
    if verbose:
        classification_rep = classification_report(Y_test_all_folds,
                                                   predictions_all_folds)
        printv(classification_rep, verbose)
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        # with-statement guarantees the handle is closed even if the write
        # fails (original used bare open()/close())
        with open(results_dir + file_name + "_classification_report.txt",
                  "w") as report_file:
            report_file.write(classification_rep)

    print("\nTotal accuracy over all folds: " + str(total_accuracy))
    print("Total 1st accuracy over all folds: " + str(total_first_acc))
    print("Total 2nd accuracy over all folds: " + str(total_second_acc))
    print("Total 3rd accuracy over all folds: " + str(total_third_acc))
    print("Total single accuracies over all folds:", total_single_accuracies)

    if verbose:
        if not os.path.exists(results_dir):
            os.makedirs(results_dir)
        cnf_mat.to_csv(results_dir + file_name + "_confusion_matrix.txt",
                       sep=' ')
    return total_accuracy, total_first_acc, total_second_acc, total_third_acc, total_single_accuracies