示例#1
0
def preprocess(corpus_folder, mode, settings, out_file, verbose_level):
    """Partition a corpus and dispatch it to the selected experiment.

    The corpus is formatted in place, then split into the sibling folder
    ``corpus_folder + "_partitioned"`` (which is first handed to
    ``preprocessor.clean_directory``). The partitioned files are loaded
    and passed to ``run_svm`` or ``run_cluster`` depending on ``mode``.
    Any other ``mode`` stops after loading, without running an experiment.

    Args:
        corpus_folder: Folder name of the corpus; ``"_partitioned"`` is
            appended to it to name the output folder.
        mode: Compared against the module-level ``SVM`` and ``KMEANS``.
        settings: Mapping that provides ``'filter_words'``,
            ``'num_lines_split'`` and ``'sliding_window_size'``, plus a
            ``'svm'`` or ``'kmeans'`` sub-mapping for the chosen mode.
        out_file: Forwarded unchanged to the experiment runner.
        verbose_level: Values above 1 print a summary of the settings;
            the value is also forwarded to the experiment runner.
    """
    preprocessor.format_corpus(corpus_folder)

    partitioned_folder = corpus_folder + "_partitioned"
    preprocessor.clean_directory(partitioned_folder)

    # Word filtering stays enabled unless the setting is exactly 0.
    filter_words = not (settings['filter_words'] == 0)

    lines_per_split = settings['num_lines_split']
    window_size = settings['sliding_window_size']
    preprocessor.split_files(lines_per_split, window_size, filter_words,
                             corpus_folder, partitioned_folder)

    all_files = load_files(partitioned_folder)

    if verbose_level > 1:
        summary = "mode : {} filter: {} window size: {} num_lines_split: {}"
        print(summary.format(mode, filter_words, window_size,
                             lines_per_split))

    if mode == SVM:
        run_svm(all_files, settings['svm']['num_runs'], out_file,
                verbose_level)
        return

    if mode == KMEANS:
        kmeans_settings = settings['kmeans']
        # Copy exactly the keys run_cluster is given, in a fixed order.
        param_dict = {
            key: kmeans_settings[key]
            for key in ('n_init', 'max_iter', 'tol', 'num_runs', 'k')
        }
        run_cluster(all_files, param_dict, out_file, verbose_level)
示例#2
0
def test_sliding_window():
    """Sweep the sliding-window overlap and plot the resulting SVM error.

    For each overlap from 20 down to 0 in steps of 2, the "sermons" corpus
    is re-formatted, "sermons_partitioned" is rebuilt with that overlap,
    and ``run_svm`` is run on the reloaded files. The recorded value is
    ``100 - run_svm(...)`` (presumably run_svm returns an accuracy
    percentage, making this an error rate -- confirm against run_svm).
    The errors are printed and passed to ``plot_sliding_window`` together
    with the overlaps that produced them.
    """
    test_errors = []
    overlaps = []

    for overlap in range(20, -1, -2):
        print("overlap num : {}".format(overlap))

        # Rebuild the partitioned corpus from scratch for this overlap.
        preprocessor.format_corpus("sermons")
        preprocessor.clean_directory("sermons_partitioned")
        preprocessor.split_files(38, overlap, True,
                                 "sermons", "sermons_partitioned")

        all_files = load_files("./sermons_partitioned/")
        svm_result = run_svm(all_files, 4, None, 3)
        test_errors.append(100 - svm_result)
        overlaps.append(overlap)

    print(test_errors)
    plot_sliding_window(test_errors, overlaps)
示例#3
0
def test_split_num():
    """Sweep the split size and plot the resulting SVM error.

    For each split number from 38 down to 4 in steps of 2, the "sermons"
    corpus is re-formatted, "sermons_partitioned" is rebuilt with that
    split number and a window value of 0, and ``run_svm`` is run on the
    reloaded files. The recorded value is ``100 - run_svm(...)``
    (presumably run_svm returns an accuracy percentage, making this an
    error rate -- confirm against run_svm). The errors are printed and
    passed to ``plot_split_num`` together with the split numbers used.
    """
    test_errors = []
    split_nums = []

    # Stop value 3 keeps the final iteration at split_num == 4.
    for split_num in range(38, 3, -2):
        print("split num : {}".format(split_num))

        # Rebuild the partitioned corpus from scratch for this split size.
        preprocessor.format_corpus("sermons")
        preprocessor.clean_directory("sermons_partitioned")
        preprocessor.split_files(split_num, 0, True,
                                 "sermons", "sermons_partitioned")

        all_files = load_files("./sermons_partitioned/")
        svm_result = run_svm(all_files, 4, None, 3)
        test_errors.append(100 - svm_result)
        split_nums.append(split_num)

    print(test_errors)
    plot_split_num(test_errors, split_nums)
示例#4
0
def test_cluster():
    """Sweep four clustering parameters one at a time and plot each sweep.

    The "sermons" corpus is formatted and split once into
    "sermons_partitioned". Then ``k``, ``n_init``, ``max_iter`` and ``tol``
    are each varied over their candidate list while the other parameters
    stay at the baseline (n_init=30, max_iter=300, tol=0.1, num_runs=10,
    k=5). Every configuration is passed to ``run_cluster`` and its return
    value collected. The collected results are printed and handed to the
    matching ``plot_cluster_*`` function.
    """
    n_init_list = list(range(10, 111, 20))
    max_iter_list = list(range(200, 1001, 100))
    tol_list = [1e6, 1e5, 1e4, 1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2]
    k_list = list(range(1, 22, 2))

    preprocessor.format_corpus("sermons")
    preprocessor.clean_directory("sermons_partitioned")
    preprocessor.split_files(10, 3, True, "sermons", "sermons_partitioned")
    all_files = load_files("./sermons_partitioned/")

    # Values used for every parameter that is not currently being swept.
    baseline = {
        'n_init': n_init_list[1],
        'max_iter': max_iter_list[1],
        'tol': tol_list[7],
        'num_runs': 10,
        'k': 5,
    }

    def sweep(param_name, candidates):
        # Run one clustering per candidate, overriding only `param_name`;
        # each call receives its own fresh parameter dict.
        return [
            run_cluster(all_files, dict(baseline, **{param_name: candidate}),
                        "tester", 0)
            for candidate in candidates
        ]

    print("k...")
    results_k = sweep('k', k_list)

    print("n_init...")
    results_n_init = sweep('n_init', n_init_list)

    print("max_iter...")
    results_max_iter = sweep('max_iter', max_iter_list)

    print("tol...")
    results_tol_list = sweep('tol', tol_list)

    for results in (results_n_init, results_max_iter, results_tol_list,
                    results_k):
        print(results)

    plot_cluster_init(results_n_init, n_init_list)
    plot_cluster_iter(results_max_iter, max_iter_list)
    plot_cluster_tol(results_tol_list, tol_list)
    plot_cluster_clusters(results_k, k_list)