def preprocess(corpus_folder, mode, settings, out_file, verbose_level):
    """Partition a corpus and run the selected experiment on it.

    The corpus in ``corpus_folder`` is formatted, the sibling directory
    ``corpus_folder + "_partitioned"`` is cleaned, and the corpus is split
    into that directory. The partitioned files are then loaded and handed
    to the SVM or k-means runner, depending on ``mode``.

    Args:
        corpus_folder: Path of the corpus directory; the partitioned output
            directory is this path with ``"_partitioned"`` appended.
        mode: Experiment selector, compared against ``SVM`` and ``KMEANS``.
            Any other value performs only the preprocessing steps.
        settings: Mapping providing ``filter_words``, ``num_lines_split``,
            ``sliding_window_size`` and the nested ``svm`` / ``kmeans``
            sections read below.
        out_file: Forwarded unchanged to the selected runner.
        verbose_level: Forwarded to the runner; values above 1 also print
            the effective configuration.
    """
    partitioned_folder = corpus_folder + "_partitioned"

    preprocessor.format_corpus(corpus_folder)
    preprocessor.clean_directory(partitioned_folder)

    # A setting equal to 0 switches word filtering off; anything else
    # leaves it on.
    filter_words = not (settings['filter_words'] == 0)

    preprocessor.split_files(
        settings['num_lines_split'],
        settings['sliding_window_size'],
        filter_words,
        corpus_folder,
        partitioned_folder,
    )
    all_files = load_files(partitioned_folder)

    if verbose_level > 1:
        summary = "mode : {} filter: {} window size: {} num_lines_split: {}"
        print(summary.format(
            mode,
            filter_words,
            settings['sliding_window_size'],
            settings['num_lines_split'],
        ))

    if mode == SVM:
        run_svm(all_files, settings['svm']['num_runs'], out_file, verbose_level)
        return

    if mode == KMEANS:
        # Copy exactly the keys the clustering runner is given, in the same
        # order the settings were originally read.
        kmeans_keys = ('n_init', 'max_iter', 'tol', 'num_runs', 'k')
        param_dict = {key: settings['kmeans'][key] for key in kmeans_keys}
        run_cluster(all_files, param_dict, out_file, verbose_level)
def test_sliding_window():
    """Record SVM test error for a range of sliding-window sizes.

    For each window size from 20 down to 0 (step 2) the "sermons" corpus
    is re-formatted, "sermons_partitioned" is cleaned, and the corpus is
    re-split with a fixed split size of 38 and word filtering enabled.
    The value stored per size is ``100 - run_svm(...)`` (presumably
    ``run_svm`` returns a percentage accuracy -- confirm against its
    definition). The collected errors are printed and then plotted
    against the window sizes via ``plot_sliding_window``.
    """
    errors = []
    overlaps = []

    # Same sequence as counting down from 20 while the value is >= 0.
    for overlap in range(20, -1, -2):
        print("overlap num : {}".format(overlap))

        # Rebuild the partitioned corpus from scratch for this window size.
        preprocessor.format_corpus("sermons")
        preprocessor.clean_directory("sermons_partitioned")
        preprocessor.split_files(38, overlap, True, "sermons",
                                 "sermons_partitioned")
        all_files = load_files("./sermons_partitioned/")

        score = run_svm(all_files, 4, None, 3)
        errors.append(100 - score)
        overlaps.append(overlap)

    print(errors)
    plot_sliding_window(errors, overlaps)
def test_split_num():
    """Record SVM test error for a range of split sizes.

    For each split size from 38 down to 4 (step 2) the "sermons" corpus
    is re-formatted, "sermons_partitioned" is cleaned, and the corpus is
    re-split with a sliding-window size of 0 and word filtering enabled.
    The value stored per size is ``100 - run_svm(...)`` (presumably
    ``run_svm`` returns a percentage accuracy -- confirm against its
    definition). The collected errors are printed and then plotted
    against the split sizes via ``plot_split_num``.
    """
    errors = []
    split_sizes = []

    # Same sequence as counting down from 38 while the value is >= 4.
    for split_num in range(38, 3, -2):
        print("split num : {}".format(split_num))

        # Rebuild the partitioned corpus from scratch for this split size.
        preprocessor.format_corpus("sermons")
        preprocessor.clean_directory("sermons_partitioned")
        preprocessor.split_files(split_num, 0, True, "sermons",
                                 "sermons_partitioned")
        all_files = load_files("./sermons_partitioned/")

        score = run_svm(all_files, 4, None, 3)
        errors.append(100 - score)
        split_sizes.append(split_num)

    print(errors)
    plot_split_num(errors, split_sizes)
def test_cluster():
    """Sweep the clustering parameters one at a time and plot each sweep.

    The "sermons" corpus is partitioned once (split size 10, sliding
    window 3, word filtering on). Then ``k``, ``n_init``, ``max_iter`` and
    ``tol`` are varied in turn while the other parameters stay at their
    baseline values (``n_init=30``, ``max_iter=300``, ``tol=0.1``,
    ``k=5``, ``num_runs=10``). Every configuration is passed to
    ``run_cluster`` with the output name "tester" and verbosity 0; the
    per-sweep results are printed and handed to the matching plot helper.
    """
    # Candidate values per parameter. Earlier grids that were tried:
    # n_init in [10, 20, 30] and tol from 1e3 down to 1e-5.
    n_init_list = [10, 30, 50, 70, 90, 110]
    max_iter_list = [200, 300, 400, 500, 600, 700, 800, 900, 1000]
    tol_list = [1e6, 1e5, 1e4, 1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2]
    k_list = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]

    preprocessor.format_corpus("sermons")
    preprocessor.clean_directory("sermons_partitioned")
    preprocessor.split_files(10, 3, True, "sermons", "sermons_partitioned")
    all_files = load_files("./sermons_partitioned/")

    def sweep(label, swept_key, candidates):
        """Run one clustering per candidate, varying only ``swept_key``."""
        print(label)
        outcomes = []
        for candidate in candidates:
            # Build a fresh dict for every run so no state is shared
            # between calls; the swept key is overridden in place, which
            # leaves the key order untouched.
            param_dict = {
                'n_init': n_init_list[1],
                'max_iter': max_iter_list[1],
                'tol': tol_list[7],
                'num_runs': 10,
                'k': 5,
            }
            param_dict[swept_key] = candidate
            outcomes.append(run_cluster(all_files, param_dict, "tester", 0))
        return outcomes

    results_k = sweep("k...", 'k', k_list)
    results_n_init = sweep("n_init...", 'n_init', n_init_list)
    results_max_iter = sweep("max_iter...", 'max_iter', max_iter_list)
    results_tol_list = sweep("tol...", 'tol', tol_list)

    for sweep_results in (results_n_init, results_max_iter,
                          results_tol_list, results_k):
        print(sweep_results)

    plot_cluster_init(results_n_init, n_init_list)
    plot_cluster_iter(results_max_iter, max_iter_list)
    plot_cluster_tol(results_tol_list, tol_list)
    plot_cluster_clusters(results_k, k_list)