def multi_proj_feature_classification(
        parameter_file,
        file_keyword,
        function_keyword="multi_proj_feature_classification"):
    """Dispatch classification on projected (top-k) features by method.

    The run configuration is read with ``read_feature_classification`` and
    the log folder is passed through ``init_folder``.  Only the ``'cnn'``
    method is executed, by delegating to
    ``projected_cnn_classification_main``; every other method returns
    ``False`` without doing any further work.

    Args:
        parameter_file: forwarded to ``read_feature_classification`` and to
            ``projected_cnn_classification_main``.
        file_keyword: forwarded to ``projected_cnn_classification_main``.
        function_keyword: forwarded to ``read_feature_classification``.

    Returns:
        Whatever ``projected_cnn_classification_main`` returns when the
        configured method is ``'cnn'``; ``False`` otherwise.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)
    log_folder = init_folder(log_folder)

    if method == 'cnn':
        return projected_cnn_classification_main(parameter_file, file_keyword)

    # Need to check the rest: the non-CNN pipeline has not been verified, so
    # it stays disabled.  Its code lives in
    # _multi_proj_feature_classification_folds instead of sitting,
    # unreachable, below this return.
    return False


def _multi_proj_feature_classification_folds(settings, file_keyword,
                                             function_keyword):
    """Per-file non-CNN pipeline; NOT called until it has been verified.

    For every file in the data folder whose name contains ``file_keyword``
    this sets up a logger, loads the first object file whose name contains
    the training file's key, reads the train/test pair, runs
    ``run_feature_projected_classification`` and saves the fold results with
    ``save_obj``.

    Args:
        settings: the 15-item tuple returned by
            ``read_feature_classification``, with the log folder already
            passed through ``init_folder``.
        file_keyword: substring a file name must contain to be processed.
        function_keyword: text embedded in each log-file name.

    Raises:
        Exception: if no file in the object folder contains the key of a
            selected training file.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder, cnn_setting_file) = settings

    # One pre-joined string prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print(" ".join(str(value) for value in settings))
    data_stru = return_data_stru(num_classes, start_class, attr_num,
                                 attr_len, class_column)
    print(obj_folder)

    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    # Files are read with the label in column 0, whatever class_column was
    # configured.
    class_column = 0
    header = True
    delimiter = ' '
    # obj_folder[:-1] drops the last character of the folder string before
    # the "_<method>_out" suffix is appended.
    save_obj_folder = init_folder(obj_folder[:-1] + "_" + method + "_out")

    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')

        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_top' +
                    str(top_k) + '_' + method + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')

        # First object file whose name contains the training file's key.
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        print(found_obj_file)
        found_obj_file = obj_folder + found_obj_file

        feature_array = np.array(load_obj(found_obj_file)[0])
        logger.info("feature array shape: " + str(feature_array.shape))

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
         attr_num) = train_test_file_reading_with_attrnum(
             data_folder + train_file, data_folder + test_file, class_column,
             delimiter, header)

        # Shapes are logged for the first processed file only.
        if loop_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num,
                                              attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num,
                                             attr_len, False)
        data_stru.attr_num = top_k

        (fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
         fold_test_time,
         fold_predict_matrix) = run_feature_projected_classification(
             train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
             feature_array, top_k, method, class_id, logger)

        logger.info("Fold F1: " + str(fold_f1_value))
        logger.info(method + ' fold training time (sec):' +
                    str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' +
                    str(fold_test_time))
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))

        result_file = (save_obj_folder + file_key + "_" + method +
                       "_project_" + method + "_result.ckpt")
        logger.info("save obj to " + result_file)
        save_obj([fold_accuracy, fold_f1_value, fold_predict_y,
                  fold_train_time, fold_test_time, fold_predict_matrix],
                 result_file)
def backward_multitime_main(parameter_file="../../parameters/",
                            file_keyword="train_",
                            n_selected_features=15):
    """Run ``backward_multitime`` one-vs-rest for every training file.

    The configuration is read with ``read_feature_classification`` under the
    keyword ``"backward_wrapper"``.  For each file of the data folder whose
    name contains ``file_keyword`` a dedicated logger is created, the
    matching test file (same name with ``'train'`` replaced by ``'test'``)
    is read, both matrices are reshaped to
    ``(n_samples, attr_num, attr_len)`` and ``backward_multitime`` is called
    once per class with 0/1 label vectors built by ``np.where``.

    When the configured ``class_id`` is ``-1`` every integer from the
    smallest to the largest training label is processed; otherwise only
    ``class_id`` is.

    Args:
        parameter_file: forwarded to ``read_feature_classification``.
        file_keyword: substring a file name must contain to be processed.
        n_selected_features: forwarded to ``backward_multitime``.

    Returns:
        None.  The selected features are written to the per-file log.
    """
    function_keyword = "backward_wrapper"
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     out_obj_folder, out_model_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)
    # One pre-joined string prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print(" ".join(str(value) for value in (
        data_keyword, data_folder, attr_num, attr_len, num_classes,
        start_class, class_column, class_id, obj_folder, method, log_folder,
        out_obj_folder, out_model_folder, cnn_setting_file)))

    log_folder = init_folder(log_folder)
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    # The result is not used below; the call is kept because its side
    # effects, if any, are not visible from this function.
    return_data_stru(num_classes, start_class, attr_num, attr_len,
                     class_column)

    # Files are always read with the label in column 0, whatever
    # class_column was configured.
    label_column = 0
    has_header = True
    delimiter = ' '

    train_files = [name for name in list_files(data_folder)
                   if file_keyword in name]
    for loop_count, train_file in enumerate(train_files):
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_' +
                    method + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix,
         test_y_vector) = train_test_file_reading(
             data_folder + train_file, data_folder + test_file, label_column,
             delimiter, has_header)

        # The two-name unpacking also rejects anything that is not 2-D.
        n_train, _ = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(n_train, attr_num, attr_len)
        n_test, _ = test_x_matrix.shape
        test_x_matrix = test_x_matrix.reshape(n_test, attr_num, attr_len)

        # Logged for every file: the counter that used to guard these lines
        # was never incremented, so the guard was always true.
        logger.info('train matrix shape: ' + str(train_x_matrix.shape))
        logger.info('train label shape: ' + str(train_y_vector.shape))
        logger.info('test matrix shape: ' + str(test_x_matrix.shape))
        logger.info('test label shape: ' + str(test_y_vector.shape))

        if class_id == -1:
            # range() needs integer bounds; int() is a no-op for integer
            # labels and keeps float-typed label vectors working.
            first_class = int(min(train_y_vector))
            stop_class = int(max(train_y_vector)) + 1
        else:
            first_class = class_id
            stop_class = class_id + 1

        for c in range(first_class, stop_class):
            logger.info("Class: " + str(c))
            # One-vs-rest: 1 for samples of class c, 0 for everything else.
            temp_train_y_vector = np.where(train_y_vector == c, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == c, 1, 0)
            top_features = backward_multitime(
                train_x_matrix, temp_train_y_vector, test_x_matrix,
                temp_test_y_vector, n_selected_features, data_keyword,
                method, cnn_setting_file, logger)
            logger.info("Top Features For Class " + str(c) + ": " +
                        str(top_features))
            logger.info("End Of Class: " + str(c))
def forward_multitime_main(parameter_file="../../parameters/",
                           file_keyword="train_"):
    """Run ``forward_multitime`` one-vs-rest for every training file.

    The configuration is read with ``read_feature_classification`` under the
    keyword ``"forward_wrapper"``.  The number of features to select and the
    number of classes are then fixed per dataset from a built-in table keyed
    by ``data_keyword``.  For each file of the data folder whose name
    contains ``file_keyword`` a dedicated logger is created, the matching
    test file (same name with ``'train'`` replaced by ``'test'``) is read,
    both matrices are reshaped to ``(n_samples, attr_num, attr_len)`` and
    ``forward_multitime`` is called for every integer class from the
    smallest to the largest training label, with 0/1 label vectors built by
    ``np.where``.

    Args:
        parameter_file: forwarded to ``read_feature_classification``.
        file_keyword: substring a file name must contain to be processed.

    Returns:
        None.  The selected features are written to the per-file log.

    Raises:
        Exception: if ``data_keyword`` is not in the built-in dataset table.
    """
    function_keyword = "forward_wrapper"
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     out_obj_folder, out_model_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)
    # One pre-joined string prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print(" ".join(str(value) for value in (
        data_keyword, data_folder, attr_num, attr_len, num_classes,
        start_class, class_column, class_id, obj_folder, top_k, method,
        log_folder, out_obj_folder, out_model_folder, cnn_setting_file)))

    # data_keyword -> (n_selected_features, num_classes)
    dataset_table = {
        "dsa": (15, 19),
        "toy": (15, 19),
        "rar": (30, 33),
        "arc": (30, 18),
        "fixed_arc": (30, 18),
        "asl": (6, 95),
    }
    if data_keyword not in dataset_table:
        raise Exception("Please fullfill the data basic information first!")
    n_selected_features, num_classes = dataset_table[data_keyword]

    log_folder = init_folder(log_folder)
    # The result is not used below; the call is kept because its side
    # effects, if any, are not visible from this function.
    return_data_stru(num_classes, start_class, attr_num, attr_len,
                     class_column)

    # Files are always read with the label in column 0, whatever
    # class_column was configured.
    label_column = 0
    has_header = True
    delimiter = ' '

    # NOTE: the original carried a commented-out mechanism that looked for
    # earlier results under "../../object/<data_keyword>/forward_wrapper/"
    # and seeded the search with them.  It is disabled, so `already` is
    # always False and the seed list handed to forward_multitime is always
    # empty; `already` still appears in the log-file name.
    already = False

    train_files = [name for name in list_files(data_folder)
                   if file_keyword in name]
    for loop_count, train_file in enumerate(train_files):
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_' +
                    method + "_top" + str(n_selected_features) +
                    '_already' + str(already) + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix,
         test_y_vector) = train_test_file_reading(
             data_folder + train_file, data_folder + test_file, label_column,
             delimiter, has_header)

        # The two-name unpacking also rejects anything that is not 2-D.
        n_train, _ = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(n_train, attr_num, attr_len)
        n_test, _ = test_x_matrix.shape
        test_x_matrix = test_x_matrix.reshape(n_test, attr_num, attr_len)

        # Logged for every file: the counter that used to guard these lines
        # was never incremented, so the guard was always true.
        logger.info('train matrix shape: ' + str(train_x_matrix.shape))
        logger.info('train label shape: ' + str(train_y_vector.shape))
        logger.info('test matrix shape: ' + str(test_x_matrix.shape))
        logger.info('test label shape: ' + str(test_y_vector.shape))

        # range() needs integer bounds; int() is a no-op for integer labels
        # and keeps float-typed label vectors working.
        first_class = int(min(train_y_vector))
        stop_class = int(max(train_y_vector)) + 1
        for c in range(first_class, stop_class):
            logger.info("Class: " + str(c))
            # A fresh list per class, so nothing leaks between calls if the
            # callee mutates it.
            already_feature = []
            # One-vs-rest: 1 for samples of class c, 0 for everything else.
            temp_train_y_vector = np.where(train_y_vector == c, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == c, 1, 0)
            top_features = forward_multitime(
                train_x_matrix, temp_train_y_vector, test_x_matrix,
                temp_test_y_vector, n_selected_features, data_keyword,
                file_key, method, cnn_setting_file, logger, already_feature)
            logger.info("Top Features For Class " + str(c) + ": " +
                        str(top_features))
            logger.info("End Of Class: " + str(c))
def best_forward_multitime_main(parameter_file="../../parameters/",
                                file_keyword="train_",
                                function_keyword="best_forward_multitime"):
    """Run ``fixed_width_forward_multitime`` one-vs-rest per training file.

    The configuration is read with ``read_feature_classification``; after
    that the configured method is appended to ``function_keyword`` for use
    in log-file names.  The number of features to select and the number of
    classes are fixed per dataset from a built-in table keyed by
    ``data_keyword``, and ``keep_k`` is fixed at 5.  For each file of the
    data folder whose name contains ``file_keyword`` a dedicated logger is
    created, the matching test file (same name with ``'train'`` replaced by
    ``'test'``) is read, both matrices are reshaped to
    ``(n_samples, attr_num, attr_len)`` and
    ``fixed_width_forward_multitime`` is called for every integer class from
    the smallest to the largest training label, with 0/1 label vectors built
    by ``np.where``.

    Args:
        parameter_file: forwarded to ``read_feature_classification``.
        file_keyword: substring a file name must contain to be processed.
        function_keyword: forwarded to ``read_feature_classification``;
            with ``"_" + method`` appended it is embedded in log-file names.

    Returns:
        None.  The selected features are written to the per-file log.

    Raises:
        Exception: if ``data_keyword`` is not in the built-in dataset table.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     out_obj_folder, out_model_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)
    # One pre-joined string prints the same text under the Python 2 print
    # statement and the Python 3 print function.
    print(" ".join(str(value) for value in (
        data_keyword, data_folder, attr_num, attr_len, num_classes,
        start_class, class_column, class_id, obj_folder, method, log_folder,
        out_obj_folder, out_model_folder, cnn_setting_file)))
    function_keyword = function_keyword + "_" + method

    # data_keyword -> (n_selected_features, num_classes)
    dataset_table = {
        "dsa": (15, 19),
        "toy": (15, 19),
        "rar": (30, 33),
        "arc": (30, 18),
        "fixed_arc": (30, 18),
        "asl": (6, 95),
    }
    if data_keyword not in dataset_table:
        raise Exception("Please fullfill the data basic information first!")
    n_selected_features, num_classes = dataset_table[data_keyword]
    keep_k = 5

    log_folder = init_folder(log_folder)
    # The result is not used below; the call is kept because its side
    # effects, if any, are not visible from this function.
    return_data_stru(num_classes, start_class, attr_num, attr_len,
                     class_column)

    # Files are always read with the label in column 0, whatever
    # class_column was configured.
    label_column = 0
    has_header = True
    delimiter = ' '

    train_files = [name for name in list_files(data_folder)
                   if file_keyword in name]
    for loop_count, train_file in enumerate(train_files):
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_' +
                    method + "_top" + str(n_selected_features) + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix,
         test_y_vector) = train_test_file_reading(
             data_folder + train_file, data_folder + test_file, label_column,
             delimiter, has_header)

        # The two-name unpacking also rejects anything that is not 2-D.
        n_train, _ = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(n_train, attr_num, attr_len)
        n_test, _ = test_x_matrix.shape
        test_x_matrix = test_x_matrix.reshape(n_test, attr_num, attr_len)

        # Logged for every file: the counter that used to guard these lines
        # was never incremented, so the guard was always true.
        logger.info('train matrix shape: ' + str(train_x_matrix.shape))
        logger.info('train label shape: ' + str(train_y_vector.shape))
        logger.info('test matrix shape: ' + str(test_x_matrix.shape))
        logger.info('test label shape: ' + str(test_y_vector.shape))

        # range() needs integer bounds; int() is a no-op for integer labels
        # and keeps float-typed label vectors working.
        first_class = int(min(train_y_vector))
        stop_class = int(max(train_y_vector)) + 1
        for c in range(first_class, stop_class):
            logger.info("Class: " + str(c))
            # One-vs-rest: 1 for samples of class c, 0 for everything else.
            temp_train_y_vector = np.where(train_y_vector == c, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == c, 1, 0)
            top_features = fixed_width_forward_multitime(
                train_x_matrix, temp_train_y_vector, test_x_matrix,
                temp_test_y_vector, n_selected_features, keep_k,
                data_keyword, file_key, method, cnn_setting_file, logger)
            logger.info("Top Features For Class " + str(c) + ": " +
                        str(top_features))
            logger.info("End Of Class: " + str(c))
def multi_projected_cnn_classification_main(
        parameter_file,
        file_keyword,
        function_keyword="multi_proj_classification"):
    """Evaluate saved projected-feature CNN models on every test fold.

    The configuration is read with ``read_feature_classification`` and the
    CNN settings with ``return_cnn_setting_from_file``.  For each file of
    the data folder whose name contains ``file_keyword`` a dedicated logger
    is created, the first object file whose name contains the training
    file's key is loaded, the train/test pair is read and transposed with
    ``train_test_transpose``, and ``run_load_predict_cnn`` is called with
    the model folder
    ``../../object/<data_keyword>/projected_classification/<obj_keyword>_top<top_k>_cnn_model_folder/``.
    Accuracy, F1 list and timings are written to the per-file log.

    Args:
        parameter_file: forwarded to ``read_feature_classification``.
        file_keyword: substring a file name must contain to be processed.
        function_keyword: forwarded to ``read_feature_classification`` and
            embedded in log-file names and in the output folder path.

    Returns:
        None.

    Raises:
        Exception: if no file in the object folder contains the key of a
            selected training file.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)

    # Second-to-last '/'-separated component of obj_folder; with a trailing
    # '/', that is the folder's own name.
    obj_keyword = obj_folder.split('/')[-2]
    model_saved_folder = ("../../object/" + data_keyword +
                          "/projected_classification/" + obj_keyword +
                          "_top" + str(top_k) + "_cnn_model_folder/")
    # Single-argument print calls behave the same in Python 2 and 3.
    print(obj_keyword)
    print(cnn_obj_folder)
    print(model_saved_folder)

    group_all = False
    log_folder = init_folder(log_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num,
                                 attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)

    # Files are always read with the label in column 0, whatever
    # class_column was configured.
    label_column = 0
    has_header = True
    delimiter = ' '

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'

    save_obj_folder = init_folder("../../object/" + data_keyword + "/" +
                                  function_keyword + "/" + obj_keyword + "/")

    train_files = [name for name in file_list if file_keyword in name]
    for loop_count, train_file in enumerate(train_files):
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_top' +
                    str(top_k) + '_' + method + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')

        # First object file whose name contains the training file's key.
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        # Entries of obj_list are joined with their folder before loading,
        # the same way data files are joined with data_folder below.
        found_obj_file = obj_folder + found_obj_file

        feature_dict = np.array(load_obj(found_obj_file)[0])
        logger.info("feature array shape: " + str(feature_dict.shape))

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
         attr_num) = train_test_file_reading_with_attrnum(
             data_folder + train_file, data_folder + test_file, label_column,
             delimiter, has_header)
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num,
                                              attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num,
                                             attr_len, False)

        # Logged for every file: the counter that used to guard these lines
        # was never incremented, so the guard was always true.
        logger.info('train matrix shape: ' + str(train_x_matrix.shape))
        logger.info('train label shape: ' + str(train_y_vector.shape))
        logger.info('test matrix shape: ' + str(test_x_matrix.shape))
        logger.info('test label shape: ' + str(test_y_vector.shape))
        logger.info("topk: " + str(top_k))

        data_stru.attr_num = top_k
        (fold_accuracy, fold_f1_list, fold_load_time,
         fold_test_time) = run_load_predict_cnn(
             file_key, model_saved_folder, feature_dict, top_k,
             test_x_matrix, test_y_vector, data_stru, cnn_setting, group_all,
             save_obj_folder, logger)

        logger.info("Fold ACC: " + str(fold_accuracy))
        logger.info("Fold F1 list: " + str(fold_f1_list))
        # The label says "training time" but the value logged is
        # fold_load_time; the text is left unchanged so existing logs and
        # anything parsing them stay comparable.
        logger.info(method + ' fold training time (sec):' +
                    str(fold_load_time))
        logger.info(method + ' fold testing time (sec):' +
                    str(fold_test_time))
def global_classification_main(parameter_file, file_keyword):
    """Train and evaluate a feature-projected CNN on every training file.

    The configuration is read with ``read_feature_classification`` under the
    keyword ``"global_classification"`` and the CNN settings with
    ``return_cnn_setting_from_file``.  For each file of the data folder
    whose name contains ``file_keyword`` a dedicated logger is created, the
    first object file whose name contains the training file's key is
    loaded, the train/test pair is read and transposed with
    ``train_test_transpose``, ``run_feature_projected_cnn`` is run and the
    fold results are saved with ``save_obj`` under
    ``cnn_setting.save_obj_folder``.

    Changes from the original body:
      * a stray unconditional ``continue`` right after the logger header
        skipped all of the per-file work; it is removed;
      * the previously unreachable code referenced two undefined names
        (``fold_f1_value_list`` and ``save_obj_folder``); they now use
        ``fold_avg_eval`` and ``cnn_setting.save_obj_folder``;
      * the log folder is passed through ``init_folder`` like in the other
        entry points of this file.

    Args:
        parameter_file: forwarded to ``read_feature_classification``.
        file_keyword: substring a file name must contain to be processed.

    Returns:
        None.

    Raises:
        Exception: if no file in the object folder contains the key of a
            selected training file.
    """
    function_keyword = "global_classification"
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder,
     cnn_setting_file) = read_feature_classification(parameter_file,
                                                     function_keyword)
    log_folder = init_folder(log_folder)

    data_stru = return_data_stru(num_classes, start_class, attr_num,
                                 attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    # Never incremented anywhere in this function, so the fold key handed to
    # run_feature_projected_cnn always ends in "_count0".
    file_count = 0
    # Files are always read with the label in column 0, whatever
    # class_column was configured.
    class_column = 0
    header = True

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'
    init_folder(cnn_obj_folder)
    init_folder(cnn_temp_folder)

    # Run-level accumulators.  Nothing visible in this function reads them
    # after the loop; they are kept unchanged for any code that does.
    all_result_matrix = np.zeros((10, num_classes))
    train_file_vector = []
    prediction_matrix = []
    f1_value_matrix = []
    accuracy_vector = []
    delimiter = ' '
    all_accuracy = 0
    all_train_time = 0
    all_test_time = 0

    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_top' +
                    str(top_k) + '_' + method + '.log')
        # Single-argument print calls behave the same in Python 2 and 3.
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')

        # First object file whose name contains the training file's key.
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        print(found_obj_file)

        result_file = (cnn_setting.save_obj_folder + file_key + "_" +
                       method + "_global_cnn_result.ckpt")
        print(result_file)

        # Entries of obj_list are joined with their folder before loading,
        # the same way data files are joined with data_folder below.
        found_obj_file = obj_folder + found_obj_file
        feature_dict = np.array(load_obj(found_obj_file)[0])
        logger.info("feature array shape: " + str(feature_dict.shape))

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
         attr_num) = train_test_file_reading_with_attrnum(
             data_folder + train_file, data_folder + test_file, class_column,
             delimiter, header)

        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num,
                                              attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num,
                                             attr_len, False)
        data_stru.attr_num = top_k

        (fold_accuracy, fold_avg_eval, fold_predict_y, fold_train_time,
         fold_test_time, fold_predict_matrix) = run_feature_projected_cnn(
             train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
             data_stru, cnn_setting, feature_dict, top_k,
             file_key + '_count' + str(file_count), class_id, logger)

        prediction_matrix.append(fold_predict_y)
        logger.info("Fold F1: " + str(fold_avg_eval))
        accuracy_vector.append(fold_accuracy)
        all_accuracy = all_accuracy + fold_accuracy
        all_train_time = all_train_time + fold_train_time
        all_test_time = all_test_time + fold_test_time
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info(method + ' fold training time (sec):' +
                    str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' +
                    str(fold_test_time))

        save_obj([fold_accuracy, fold_avg_eval, fold_predict_y,
                  fold_train_time, fold_test_time, fold_predict_matrix],
                 result_file)