def project_cnn_feature_combined_cpca_analysis(feature_matrix, y_vector, logger=None):
    if logger is None:
        logger = setup_logger('')
    threshold = 0.9
    feature_matrix = np.squeeze(feature_matrix)
    num_instance, num_attribute, num_map = feature_matrix.shape
    # Keep only the positive-class instances for the CPCA analysis.
    label_index = np.where(y_vector == 1)[0]
    label_x_matrix = feature_matrix[label_index, :, :]
    logger.info("x matrix tran before shape: " + str(label_x_matrix.shape))
    start_time = time.time()
    cpca_matrix = computeDCPC(label_x_matrix, threshold)
    attr_score = clever_rank(cpca_matrix, logger)
    # Sort attributes by descending CPCA score.
    sorted_dict = sorted(attr_score.items(), key=operator.itemgetter(1), reverse=True)
    sorted_attr = [item[0] for item in sorted_dict]
    run_time = time.time() - start_time
    print(cpca_matrix.shape)
    print(sorted_attr)
    return sorted_attr, run_time
def map_attr_imp_analysis(map_attr_imp_matrix, logger=None):
    if logger is None:
        logger = setup_logger('')
    logger.info(map_attr_imp_matrix.shape)
    if map_attr_imp_matrix.ndim == 1:
        # Single importance vector: convert raw importance values to ranks
        # (0 = least important), then return attribute indices best-first.
        attr_imp = map_attr_imp_matrix
        sort_index = np.argsort(attr_imp)
        norm_imp = np.zeros(len(attr_imp))
        for imp_value, index in enumerate(sort_index):
            norm_imp[index] = imp_value
        return np.argsort(norm_imp)[::-1]
    # One importance vector per feature map: rank-normalize each row so maps
    # with different score scales contribute equally, then sum the ranks.
    num_map, num_attr = map_attr_imp_matrix.shape
    norm_all_imp = np.zeros(num_attr)
    for i in range(num_map):
        attr_imp = map_attr_imp_matrix[i, :]
        sort_index = np.argsort(attr_imp)
        norm_imp = np.zeros(num_attr)
        for imp_value, index in enumerate(sort_index):
            norm_imp[index] = imp_value
        norm_all_imp = norm_all_imp + norm_imp
    return np.argsort(norm_all_imp)[::-1]
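# Illustrative usage of map_attr_imp_analysis (not part of the original
# source; the numbers are made up).  The rank normalization means the final
# ordering depends only on per-map ranks, not on raw score magnitudes.
def _demo_map_attr_imp_analysis():
    imp = np.array([[0.70, 0.20, 0.10],    # map 0 ranks: [2, 1, 0]
                    [0.05, 0.90, 0.40]])   # map 1 ranks: [0, 2, 1]
    # Summed ranks are [2, 3, 1], so the expected output order is [1, 0, 2].
    print(map_attr_imp_analysis(imp))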
def run_nn(train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix, nn_setting, logger=None):
    if logger is None:
        logger = setup_logger('')
    x_row, x_col = train_x_matrix.shape
    y_row, y_col = train_y_matrix.shape
    num_classes = y_col
    train_x_placeholder, train_y_placeholder, logits_out, keep_prob_placeholder = configure_nn(
        x_col, num_classes, nn_setting)
    best_eval_value, train_run_time, test_run_time, nn_predict_proba = nn_train(
        train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
        train_x_placeholder, train_y_placeholder, logits_out,
        keep_prob_placeholder, nn_setting, logger)
    return best_eval_value, train_run_time, test_run_time, nn_predict_proba
def configure_nn(train_x_col, num_classes, nn_setting, logger=None):
    if logger is None:
        logger = setup_logger('')
    std_value = nn_setting.std_value
    tf.reset_default_graph()
    tf.random.set_random_seed(0)
    train_x_placeholder = tf.placeholder(tf.float32, [None, train_x_col])
    train_y_placeholder = tf.placeholder(tf.float32, [None, num_classes])
    layer_out_matrix, layer_iter, keep_prob_placeholder = conf_nn_layers(
        train_x_col, train_x_placeholder, nn_setting, logger)
    logits_out = conf_nn_out(layer_out_matrix, num_classes, std_value, layer_iter)
    return train_x_placeholder, train_y_placeholder, logits_out, keep_prob_placeholder
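# Illustrative wiring of run_nn/configure_nn (not part of the original
# source).  It assumes the nn_parameters constructor signature used later in
# this file (layer_list, batch_size, max_epoch, stop_threshold,
# activation_fun, std_value, eval_method, saver_file); the data is random
# and only demonstrates the expected shapes: 2-D float features plus
# one-hot label matrices.
def _demo_run_nn():
    nn_setting = nn_parameters(np.array([400]), 100, 10, 0.99, 3, 0.02,
                               "acc", './demo_nn.save')
    train_x = np.random.rand(200, 50).astype(np.float32)
    train_y = np.eye(2)[np.random.randint(0, 2, 200)]   # one-hot, 2 classes
    test_x = np.random.rand(40, 50).astype(np.float32)
    test_y = np.eye(2)[np.random.randint(0, 2, 40)]
    eval_value, train_time, test_time, proba = run_nn(
        train_x, train_y, test_x, test_y, nn_setting)
    print(eval_value, proba.shape)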
def project_cnn_feature_combined_rf_lda_analysis_with_highest(
        feature_matrix, y_vector, logger=None, rf_estimator=50):
    if logger is None:
        logger = setup_logger('')
    feature_matrix = np.squeeze(feature_matrix)
    num_instance, num_attribute, num_map = feature_matrix.shape
    predict = True
    skip_count = 0
    rf_time = 0
    lda_time = 0
    keep_avg_acc = -1
    map_attr_imp_matrix = []
    success_count = 0
    for i in range(num_map):
        map_feature_matrix = feature_matrix[:, :, i]
        if not np.any(map_feature_matrix):
            # All-zero feature map: nothing to rank.
            skip_count = skip_count + 1
            continue
        rf_lda_avg_acc = 0
        feature_value_vector = np.zeros(num_attribute)
        start_time = time.time()   # was start_time = 0, which broke the timing
        rf_feature_value_vector, rf_model, rf_f1_value, rf_run_time = rf_feature_extraction(
            map_feature_matrix, y_vector, predict, logger, rf_estimator)
        rf_time = rf_time + time.time() - start_time
        if rf_model is not None:
            # Accumulate the scalar model scores (the original assigned the
            # feature vectors here, which broke the scalar comparison below;
            # its own commented-out line shows this was the intent).
            rf_lda_avg_acc = rf_f1_value
            feature_value_vector = rf_feature_value_vector * rf_f1_value
        start_time = time.time()
        lda_feature_vector, lda_model, lda_averaged_acc, lda_run_time = lda_feature_extraction(
            map_feature_matrix, y_vector, predict, logger)
        lda_time = lda_time + time.time() - start_time
        if lda_model is not None:
            rf_lda_avg_acc = rf_lda_avg_acc + lda_averaged_acc
            success_count = success_count + 1
        # Keep only the feature map(s) with the highest combined accuracy.
        if keep_avg_acc < rf_lda_avg_acc:
            keep_avg_acc = rf_lda_avg_acc
            if lda_model is not None:
                feature_value_vector = feature_value_vector + lda_feature_vector * lda_averaged_acc
            map_attr_imp_matrix = [feature_value_vector]
        elif keep_avg_acc == rf_lda_avg_acc:
            map_attr_imp_matrix.append(feature_value_vector)
    map_attr_imp_matrix = np.array(map_attr_imp_matrix)
    logger.info("success count: " + str(success_count))
    logger.info("highest acc: " + str(keep_avg_acc))
    return map_attr_imp_matrix, rf_time + lda_time
def conf_nn_layers(train_x_col, input_placeholder, nn_setting, logger=None):
    if logger is None:
        logger = setup_logger('')
    layer_list = nn_setting.layer_list
    std_value = nn_setting.std_value
    layer_out = input_placeholder
    layer_iter = 0
    layer_input = train_x_col
    keep_prob_placeholder = tf.placeholder(tf.float32)
    for neurons in layer_list:
        # One fully connected ReLU layer with dropout per entry in layer_list.
        weight_name = "weight_" + str(layer_iter)
        bias_name = "bias_" + str(layer_iter)
        weight = tf.Variable(
            tf.random_normal([layer_input, neurons], stddev=std_value, seed=layer_iter),
            name=weight_name)
        bias = tf.Variable(tf.zeros([neurons]), name=bias_name)
        layer_input = neurons
        hidden_out = tf.add(tf.matmul(layer_out, weight), bias)
        layer_out = tf.nn.relu(hidden_out)
        layer_out = tf.nn.dropout(layer_out, keep_prob_placeholder)
        layer_iter = layer_iter + 1
    return layer_out, layer_iter, keep_prob_placeholder
def project_cnn_feature_combined_lda_analysis(feature_matrix, y_vector, logger=None, rf_estimator=50):
    if logger is None:
        logger = setup_logger('')
    feature_matrix = np.squeeze(feature_matrix)
    num_instance, num_attribute, num_map = feature_matrix.shape
    predict = True
    feature_matrix_2d = feature_matrix.reshape(num_instance, num_attribute * num_map)
    start_time = time.time()   # was start_time = 0, which broke the timing
    lda_feature_value_vector, lda_model, lda_f1_value, lda_run_time = lda_feature_extraction(
        feature_matrix_2d, y_vector, predict, logger)
    lda_time = time.time() - start_time
    # Fold the per-(attribute, map) importance back to one value per
    # attribute and normalize it to sum to 1.
    lda_feature_value_vector = lda_feature_value_vector.reshape(num_attribute, num_map)
    lda_feature_value_vector = np.sum(lda_feature_value_vector, axis=1)
    lda_feature_value_vector = lda_feature_value_vector / float(sum(lda_feature_value_vector))
    lda_class_attr_list = map_attr_imp_analysis(lda_feature_value_vector, logger)
    logger.info("lda feature value: " + str(lda_feature_value_vector.shape))
    logger.info("lda f1 value: " + str(lda_f1_value))
    logger.info("lda only attr: " + str(lda_class_attr_list))
    return lda_feature_value_vector, lda_time
def run_pure_pv_evaluation(file_keyword,
                           parameter_file='../../parameters/pv_baseline_evaluation.txt',
                           function_keyword="pure_pv_evaluation"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, method, log_folder, out_obj_folder = read_pure_feature_generation(
        parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, method, log_folder, out_obj_folder)
    file_list = list_files(data_folder)
    file_count = 0
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        train_key = train_file.replace('.txt', '')
        file_count = file_count + 1
        data_matrix, attr_num = file_reading(data_folder + train_file)
        train_x_matrix, train_y_vector = x_y_spliting(data_matrix, class_column)
        train_row, train_col = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(train_row, attr_num, attr_len)
        if class_id < 0:
            min_class = min(train_y_vector)
            max_class = max(train_y_vector) + 1
        else:
            min_class = class_id
            max_class = min_class + 1
        log_file = train_key + "_" + method + "_min" + str(min_class) + "_max" + str(max_class) + "_pure_projected.log"
        logger = setup_logger(log_folder + log_file)
        print("log file: " + log_folder + log_file)
        logger.info(train_file)
        out_obj_file = train_key + "_" + method + "_min" + str(min_class) + "_max" + str(max_class) + "_pure_projected.obj"
        out_obj_matrix = []
        logger.info("min class: " + str(min_class))
        logger.info("max class: " + str(max_class))
        for label in range(min_class, max_class):
            # One-vs-rest labeling for the current class.
            class_train_y = np.where(train_y_vector == label, 1, 0)
            logger.info("label: " + str(label))
            if method == 'rf_lda':
                class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_rf_lda_analysis(
                    train_x_matrix, class_train_y, logger)
            elif method == "rf":
                class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_rf_analysis(
                    train_x_matrix, class_train_y, logger)
            elif method == "lda":
                class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_lda_analysis(
                    train_x_matrix, class_train_y, logger)
            else:
                raise Exception("Unknown method: " + method)
            logger.info("class attr imp matrix shape: " + str(class_attr_imp_matrix.shape))
            class_attr_list = map_attr_imp_analysis(class_attr_imp_matrix, logger)
            logger.info(class_attr_list)
            logger.info(class_attr_list.shape)
            out_obj_matrix.append(class_attr_list)
        out_obj_matrix = np.array(out_obj_matrix)
        logger.info("out obj to: " + out_obj_folder + out_obj_file)
        logger.info(out_obj_matrix.shape)
        save_obj([out_obj_matrix], out_obj_folder + out_obj_file)
def backward_multitime(train_x, train_y, test_x, test_y, n_selected_features,
                       data_key="test", method="cnn",
                       cnn_setting_file="../../parameters/cnn_model_parameter.txt",
                       logger=None):
    """
    Backward feature selection with a wrapper classifier (CNN or random forest).

    Input
    -----
    train_x: {3d numpy array}, shape (n_samples, n_features, time_length)
        training data
    train_y: {1d numpy array}, shape (n_samples,)
        training class labels
    test_x: {3d numpy array}, shape (n_samples, n_features, time_length)
        test data
    test_y: {1d numpy array}, shape (n_samples,)
        test class labels

    Output
    ------
    F: {numpy array}, shape (n_selected_features,)
        indices of the selected features
    """
    if logger is None:
        logger = setup_logger("")
    train_samples, n_features, time_length = train_x.shape
    f_score = []
    eval_method = "f1"
    if method == "cnn":
        min_class = min(train_y)
        max_class = max(train_y)
        num_classes = max_class - min_class + 1
        data_stru = data_structure(num_classes, min_class, n_features, time_length)
        cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        saver_file_profix = "../../object/" + data_key + "/backward_multitime/" + method
        saver_file_profix = init_folder(saver_file_profix)
        saver_file_profix = saver_file_profix + return_cnn_keyword(cnn_setting)
        eval_method = cnn_setting.eval_method
        all_f_eval_value, all_f_train_time, all_f_test_time, predict_proba, saver_file, feature_list_obj_file, relu_base_array = model_evaluation_cnn(
            train_x, train_y, test_x, test_y, data_stru, cnn_setting, saver_file_profix, logger)
    elif method == "rf":
        model = RandomForestClassifier(n_estimators=50, random_state=0)
        all_f_eval_value, all_f_train_time, all_f_test_time = model_evaluation_rf(
            train_x, train_y, test_x, test_y, model, logger)
    logger.info("With ALL Features")
    logger.info(method + " " + eval_method + " Value For ALL Features: " + str(all_f_eval_value))
    logger.info(method + " Training time (sec): " + str(all_f_train_time))
    logger.info(method + " Testing time (sec): " + str(all_f_test_time))
    # Selected feature set, initialized to contain all features.
    F = list(range(n_features))
    count = n_features
    iter_num = 0
    while count > n_selected_features:
        max_eval_value = -1
        for i in range(n_features):
            if i not in F:
                continue
            # Temporarily remove feature i and evaluate the remaining set.
            F.remove(i)
            train_x_tmp = train_x[:, F, :]
            test_x_tmp = test_x[:, F, :]
            if method == "cnn":
                eval_value, train_run_time, test_run_time, predict_proba, saver_file, feature_list_obj_file, relu_base_array = model_evaluation_cnn(
                    train_x_tmp, train_y, test_x_tmp, test_y, data_stru, cnn_setting, saver_file_profix, logger)
            elif method == "rf":
                eval_value, train_run_time, test_run_time = model_evaluation_rf(
                    train_x_tmp, train_y, test_x_tmp, test_y, model, logger)
            f_eval_value = all_f_eval_value - eval_value
            logger.info("Without Feature " + str(i) + ": ")
            logger.info(method + " " + eval_method + " Value For Feature " + str(i) + ": " + str(f_eval_value))
            logger.info(method + " Training time (sec): " + str(train_run_time))
            logger.info(method + " Testing time (sec): " + str(test_run_time))
            f_score.append(f_eval_value)
            F.append(i)
            # Record the feature whose removal yields the highest accuracy.
            if eval_value > max_eval_value:
                max_eval_value = eval_value
                idx = i
        logger.info("For iter " + str(iter_num))
        logger.info("Eval score vector: " + str(f_score))
        logger.info("The removed attribute is: " + str(idx))
        # Permanently delete the feature whose removal hurt the least.
        F.remove(idx)
        count -= 1
        iter_num = iter_num + 1
    return np.array(F)
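# Self-contained illustration of the greedy scheme used above (not part of
# the original pipeline): each round drops the single feature whose removal
# hurts test accuracy the least.  sklearn's RandomForestClassifier stands in
# for the model_evaluation_* helpers, and the data is random.
def _demo_backward_elimination(n_selected_features=2):
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score
    rng = np.random.RandomState(0)
    train_x, test_x = rng.rand(80, 5), rng.rand(20, 5)
    train_y, test_y = rng.randint(0, 2, 80), rng.randint(0, 2, 20)
    F = list(range(train_x.shape[1]))
    while len(F) > n_selected_features:
        best_acc, idx = -1, None
        for i in list(F):
            F.remove(i)   # temporarily drop feature i
            model = RandomForestClassifier(n_estimators=10, random_state=0)
            model.fit(train_x[:, F], train_y)
            acc = accuracy_score(test_y, model.predict(test_x[:, F]))
            F.append(i)
            if acc > best_acc:
                best_acc, idx = acc, i
        F.remove(idx)     # removing idx hurt the least, so drop it for good
    return np.array(F)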
def backward_multitime_main(parameter_file="../../parameters/", file_keyword="train_", n_selected_features=15):
    function_keyword = "backward_wrapper"
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file = read_all_feature_classification(
        parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
          out_model_folder, cnn_setting_file)
    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    file_count = 0
    class_column = 0
    header = True
    delimiter = ' '
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(class_id) + '_' + method + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector = train_test_file_reading(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        n_samples, n_col = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(n_samples, attr_num, attr_len)
        n_samples, n_col = test_x_matrix.shape
        test_x_matrix = test_x_matrix.reshape(n_samples, attr_num, attr_len)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        if class_id == -1:
            min_class = min(train_y_vector)
            max_class = max(train_y_vector) + 1
        else:
            min_class = class_id
            max_class = class_id + 1
        for c in range(min_class, max_class):
            logger.info("Class: " + str(c))
            temp_train_y_vector = np.where(train_y_vector == c, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == c, 1, 0)
            top_features = backward_multitime(
                train_x_matrix, temp_train_y_vector, test_x_matrix, temp_test_y_vector,
                n_selected_features, data_keyword, method, cnn_setting_file, logger)
            logger.info("Top Features For Class " + str(c) + ": " + str(top_features))
            logger.info("End Of Class: " + str(c))
def min_distance_shapelet_series(series_matrix, shapelet_matrix, mask_matrix, series_keep_len, logger=None):
    if logger is None:
        logger = setup_logger('')
    mask_matrix = np.maximum(0, mask_matrix)
    num_series, attr_num, attr_len = series_matrix.shape
    num_shap, attr_num, shap_max = shapelet_matrix.shape
    ret_a_matrix = []
    for serie_iter in range(num_series):
        ret_a_vector = []
        keep_len = series_keep_len[serie_iter]
        series = series_matrix[serie_iter, :, 0:keep_len]
        for shap_iter in range(num_shap):
            attr_dist = 0
            for attr in range(attr_num):
                attr_mask = mask_matrix[shap_iter, attr]
                attr_shapelet = shapelet_matrix[shap_iter, attr, :]
                shap_len = len(attr_shapelet)
                attr_series = series[attr, :]
                attr_series_len = len(attr_series)
                attr_shapelet = attr_shapelet.reshape(1, shap_len)
                if attr_series_len < shap_len:
                    # Series shorter than the shapelet: compare only the
                    # overlapping prefix (the original left the shapelet at
                    # full length here, which fails to broadcast).
                    shap_len = attr_series_len
                    attr_shapelet = attr_shapelet[:, 0:shap_len]
                    loop_count = 1
                else:
                    loop_count = attr_series_len - shap_len + 1
                # Slide the shapelet over the series and keep the minimum
                # squared Euclidean distance over all start positions.
                min_euclidean = np.sum(np.square(attr_shapelet - attr_series[0:shap_len]))
                for start in range(1, loop_count):
                    end = start + shap_len
                    euclidean = np.sum(np.square(attr_shapelet - attr_series[start:end]))
                    if euclidean < min_euclidean:
                        min_euclidean = euclidean
                # Length-normalize, then weight by the channel mask.
                min_euclidean = min_euclidean / float(shap_len)
                min_euclidean = min_euclidean * attr_mask
                attr_dist = attr_dist + min_euclidean
            attr_dist = attr_dist / float(attr_num)
            ret_a_vector.append(attr_dist)
        ret_a_matrix.append(ret_a_vector)
    return np.array(ret_a_matrix)
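# Minimal numpy sketch of the inner sliding-window computation above
# (illustrative, made-up data): the best match of a length-3 shapelet inside
# a length-6 series under length-normalized squared Euclidean distance.
def _demo_sliding_min_distance():
    series = np.array([0.0, 1.0, 2.0, 1.0, 0.0, 0.0])
    shapelet = np.array([1.0, 2.0, 1.0])
    shap_len = len(shapelet)
    dists = [np.sum(np.square(shapelet - series[s:s + shap_len]))
             for s in range(len(series) - shap_len + 1)]
    print(min(dists) / float(shap_len))   # 0.0: exact match at offset 1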
    # (tail of a helper whose header was lost in extraction: it sanity-checks
    # z-normalized data by requiring the standard deviation to be ~1 within
    # floating-point tolerance)
    if std > 1.00001 or std < 0.99999:
        return False
    return True


if __name__ == '__main__':
    argv_array = sys.argv
    run_stdout = sys.stdout
    file_keyword = 'train_'
    len_argv_array = len(argv_array)
    if len_argv_array > 1:
        try:
            val = int(argv_array[1])
            file_keyword = file_keyword + argv_array[1]
        except ValueError:
            print("That's not an int!")
    # Only the last assignment takes effect; the earlier keys record the
    # other datasets this script has been run on.
    data_key = 'dsa'
    data_key = 'rar'
    data_key = 'arc'
    data_key = 'arabic'
    data_folder = "../../data/" + data_key + "/train_test_1_fold/"
    log_folder = "../../log/" + data_key
    log_folder = init_folder(log_folder)
    log_file = log_folder + data_key + "_z_norm.log"
    logger = setup_logger(log_file)
    run_z_norm_main(data_folder, file_keyword, logger)
def run_channel_mask_main(data_folder, log_folder, obj_folder, shap_k=10, shap_min=2,
                          shap_max=3, file_key="train_", fun_key="_mask_gene"):
    file_list = list_files(data_folder)
    file_count = 0
    for train_file in file_list:
        if file_key not in train_file:
            continue
        this_keyword = train_file.replace('.txt', '')
        log_file = this_keyword + fun_key + "_shapNum" + str(shap_k) + "_shapMin" + str(shap_min) + "_shapMax" + str(shap_max) + "_all_class.log"
        out_obj_file = this_keyword + fun_key + "_shapNum" + str(shap_k) + "_shapMin" + str(shap_min) + "_shapMax" + str(shap_max)
        logger = setup_logger(log_folder + log_file)
        print("log file: " + log_folder + log_file)
        print("obj file: " + obj_folder + out_obj_file)
        logger.info(log_folder + log_file)
        out_obj_dict = {}
        file_count = file_count + 1
        test_file = train_file.replace('train_', 'test_')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file)
        train_row, train_col = train_x_matrix.shape
        test_row, test_col = test_x_matrix.shape
        attr_len = train_col // attr_num   # integer division (was Python 2 '/')
        train_x_matrix = train_x_matrix.reshape(train_row, attr_num, attr_len)
        test_x_matrix = test_x_matrix.reshape(test_row, attr_num, attr_len)
        logger.info("train x matrix: " + str(train_x_matrix.shape))
        logger.info("test x matrix: " + str(test_x_matrix.shape))
        train_keep_len = matrix_keep_len_gene(train_x_matrix)
        test_keep_len = matrix_keep_len_gene(test_x_matrix)
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
        num_classes = max_class - min_class
        logger.info("x matrix tran after shape: " + str(train_x_matrix.shape))
        for label in range(min_class, max_class):
            # Process the classes in reverse order (assumes min_class == 0).
            label = max_class - label - 1
            label_train_y_vector = np.where(train_y_vector == label, 1, 0)
            label_test_y_vector = np.where(test_y_vector == label, 1, 0)
            label_train_y_matrix = y_vector_to_matrix(label_train_y_vector, 2)
            label_test_y_matrix = y_vector_to_matrix(label_test_y_vector, 2)
            logger.info("class: " + str(label))
            test_eval_value, mask_value = run_channel_mask(
                train_x_matrix, label_train_y_matrix, train_keep_len,
                test_x_matrix, label_test_y_matrix, test_keep_len,
                shap_k, shap_min, shap_max, logger)
            logger.info("final for class " + str(label))
            logger.info("final acc: " + str(test_eval_value))
            logger.info("final mask: " + str(mask_value.shape))
            logger.info("out obj saved to " + obj_folder + out_obj_file + "_class" + str(label) + ".obj")
            save_obj([mask_value], obj_folder + out_obj_file + "_class" + str(label) + ".obj")
def mask_evaluation_main(log_folder, obj_folder, out_obj_folder, obj_keyword,
                         shap_k=-1, shap_min=-1, shap_max=-1, func_key="arxiv_mask_gene"):
    log_folder = init_folder(log_folder + func_key)
    log_file = obj_keyword + "_allclass_" + func_key + ".log"
    logger = setup_logger(log_folder + log_file)
    logger.info("log folder: " + log_folder)
    logger.info("obj folder: " + obj_folder)
    obj_file_list = list_files(obj_folder)
    if shap_k != -1:
        obj_sec_key = "shapNum" + str(shap_k) + "_shapMin" + str(shap_min) + "_shapMax" + str(shap_max)
    else:
        obj_sec_key = ".obj"
    min_class = 100
    max_class = -1
    output_array = []
    for obj_file in obj_file_list:
        if obj_keyword not in obj_file:
            continue
        if "_class" not in obj_file:
            continue
        if obj_sec_key not in obj_file:
            continue
        class_key = obj_file.split('_')[-1]
        class_key = class_key.replace('class', '').replace('.obj', '')
        logger.info("obj file: " + obj_file)
        logger.info("class key: " + class_key)
        class_key = int(class_key)
        if min_class > class_key:
            min_class = class_key
        if max_class < class_key:
            max_class = class_key
        shap_mask = load_obj(obj_folder + obj_file)[0]
        if len(shap_mask) == 0:
            continue
        shap_mask = numpy.array(shap_mask)
        shap_mask = numpy.squeeze(shap_mask)
        logger.info("shap_mask shape: " + str(shap_mask.shape))
        # Sum the absolute mask weights over shapelets, then rank-normalize
        # the per-attribute totals (same scheme as map_attr_imp_analysis).
        shap_mask = numpy.absolute(shap_mask)
        shap_mask = numpy.sum(shap_mask, axis=0)
        logger.info(shap_mask)
        sort_index = numpy.argsort(shap_mask)
        imp_value = 0
        norm_imp = numpy.zeros(len(shap_mask))
        for index in sort_index:
            norm_imp[index] = imp_value
            imp_value = imp_value + 1
        shap_mask_index = numpy.argsort(norm_imp)[::-1]
        logger.info(shap_mask_index)
        logger.info("====")
        output_array.append(shap_mask_index)
        logger.info("shap_mask final shape: " + str(shap_mask.shape))
    output_array = numpy.array(output_array)
    obj_file = obj_keyword + "_min" + str(min_class) + "_max" + str(max_class) + "out.obj"
    logger.info("final output obj shape: " + str(output_array.shape))
    logger.info(output_array)
    save_obj([output_array], out_obj_folder + obj_file)
def cnn_load_main(parameter_file, file_keyword, function_keyword="cnn_classification"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file = read_all_feature_classification(parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
          out_model_folder, cnn_setting_file)
    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    file_count = 0
    class_column = 0
    header = True
    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    cnn_setting.full_feature_num = 400
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    print(out_model_folder)
    model_file_list = list_files(out_model_folder)
    result_obj_folder = obj_folder + method + "_result_folder"
    result_obj_folder = init_folder(result_obj_folder)
    logger = setup_logger('')
    delimiter = ' '
    loop_count = -1
    saver_file_profix = ""
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        saver_file_profix = file_key
        test_file = train_file.replace('train', 'test')
        data_group, attr_num = train_test_file_reading(
            data_folder + train_file, data_folder + test_file, '', class_column, delimiter, header)
        train_x_matrix = data_group.train_x_matrix
        train_y_vector = data_group.train_y_vector
        test_x_matrix = data_group.test_x_matrix
        test_y_vector = data_group.test_y_vector
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        train_y_matrix = y_vector_to_matrix(train_y_vector, num_classes)
        test_y_matrix = y_vector_to_matrix(test_y_vector, num_classes)
        # Locate the saved model checkpoint for this train file.
        found_model_file = ""
        for model_file in model_file_list:
            if model_file.startswith(file_key):
                model_file = model_file.split('.')[0]
                found_model_file = out_model_folder + model_file + ".ckpt"
                break
        if found_model_file == "":
            raise Exception("No model object file found!!!")
        print(found_model_file)
        cnn_session, logits_out, train_x_placeholder, keep_prob_placeholder, keeped_feature_list = load_model(
            found_model_file, data_stru, cnn_setting, logger)
        # Run the last convolutional layer on train and test data.
        last_conv_tensor = keeped_feature_list[0]
        train_last_conv = cnn_session.run(last_conv_tensor, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        test_last_conv = cnn_session.run(last_conv_tensor, feed_dict={
            train_x_placeholder: test_x_matrix, keep_prob_placeholder: 1.0})
        drop_num = 10
        print(np.squeeze(test_last_conv[1, :, :, :]))
        test_last_conv = top_attr_x_matrix(test_last_conv, drop_num)
        print(np.squeeze(test_last_conv[1, :, :, :]))
        train_last_conv = top_attr_x_matrix(train_last_conv, drop_num)
        output_y_placeholder = tf.placeholder(tf.float32, [None, num_classes])
        actual = tf.argmax(output_y_placeholder, axis=1)
        prediction = tf.argmax(logits_out, axis=1)
        correct_prediction = tf.equal(actual, prediction)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        ori_pred_y_vector = cnn_session.run(prediction, feed_dict={
            train_x_placeholder: test_x_matrix, keep_prob_placeholder: 1.0})
        test_accuracy = cnn_session.run(accuracy, feed_dict={
            train_x_placeholder: test_x_matrix, keep_prob_placeholder: 1.0,
            output_y_placeholder: test_y_matrix})
        cnn_session.close()
        kernel_eval_matrix, ref_kernel_eval_matrix = last_conv_analysis(train_last_conv, train_y_vector)
        print(kernel_eval_matrix.shape)
        print(kernel_eval_matrix)
        train_ins_len = len(train_y_vector)
        test_ins_len = len(test_y_vector)
        batch_size = 100
        layer_list = np.array([400])
        max_epoch = 10
        stop_threshold = 0.99
        activation_fun = 3
        std_value = 0.02
        eval_method = "acc"
        saver_file = './test_1.save'
        nn_setting = nn_parameters(layer_list, batch_size, max_epoch, stop_threshold,
                                   activation_fun, std_value, eval_method, saver_file)
        all_pred_prob = []
        for c in range(num_classes):
            # One-vs-rest label vectors and matrices for class c.
            train_y_vector_class = np.zeros((train_ins_len))
            index_class = np.where(train_y_vector == c)[0]
            train_y_vector_class[index_class] = 1
            train_y_m_class = y_vector_to_matrix(train_y_vector_class, 2)
            test_y_vector_class = np.zeros((test_ins_len))
            index_class = np.where(test_y_vector == c)[0]
            test_y_vector_class[index_class] = 1
            test_y_m_class = y_vector_to_matrix(test_y_vector_class, 2)
            keep_num = 5
            kernel_index = kernel_eval_matrix[c, 0:keep_num]
            ref_kernel_index = ref_kernel_eval_matrix[c, 0:keep_num]
            print("kernel index " + str(kernel_index))
            print("ref kernel index " + str(ref_kernel_index))
            kernel_index = np.concatenate((kernel_index, ref_kernel_index), axis=0)
            print("union index " + str(kernel_index))
            kernel_index = np.unique(kernel_index)
            print("unique index " + str(kernel_index))
            kernel_index = ref_kernel_eval_matrix[c, 0:keep_num]
            train_x_class = train_last_conv[:, :, :, kernel_index]
            test_x_class = test_last_conv[:, :, :, kernel_index]
            print(train_x_class.shape)
            reshape_col = 45 * len(kernel_index)
            train_x_class = train_x_class.reshape((train_ins_len, reshape_col))
            test_x_class = test_x_class.reshape((test_ins_len, reshape_col))
            c_eval_value, c_train_time, c_test_time, c_predict_proba = run_nn(
                train_x_class, train_y_m_class, test_x_class, test_y_m_class, nn_setting)
            all_pred_prob.append(c_predict_proba[:, 1] - c_predict_proba[:, 0])
        all_pred_prob = np.array(all_pred_prob)
        print(all_pred_prob.shape)
        pred_vector = np.argmax(all_pred_prob, axis=0)
        print(pred_vector)
        print(all_pred_prob[:, 0])
        print(all_pred_prob[:, 1])
        print(all_pred_prob[:, 2])
        final_accuracy = accuracy_score(pred_vector, test_y_vector)
        avg_acc, ret_str = averaged_class_based_accuracy(ori_pred_y_vector, test_y_vector)
        print("original avg acc: " + str(avg_acc))
        print("original accuracy: " + str(test_accuracy))
        print(ret_str)
        avg_acc, ret_str = averaged_class_based_accuracy(pred_vector, test_y_vector)
        print("avg acc: " + str(avg_acc))
        print("new accuracy: " + str(final_accuracy))
        print(ret_str)
        load_result_analysis(all_pred_prob, test_y_vector)
        # The original stopped here by hitting an undefined name ('sdfds')
        # used as a crash marker; made explicit below.  Everything past this
        # point is leftover exploration code kept for reference only, and
        # some names in it (fir_weight_variable, kernel_dist, keep_index,
        # fir_out_mean, fir_weight_var_val, ...) are never defined in this
        # version of the file.
        raise SystemExit("debug stop")
        output_y_placeholder = tf.placeholder(tf.float32, [None, num_classes])
        actual = tf.argmax(output_y_placeholder, axis=1)
        prediction = tf.argmax(logits_out, axis=1)
        correct_prediction = tf.equal(actual, prediction)
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        test_eval_value = accuracy.eval(feed_dict={
            train_x_placeholder: test_x_matrix, output_y_placeholder: test_y_matrix,
            keep_prob_placeholder: 1.0})
        print("first")
        print(test_eval_value)
        conv_count = 1
        drop_ratio = 0.1
        #conv_variable_up_main(cnn_session, conv_count, drop_ratio)
        weight_name = "conv_w_" + str(0) + ":0"
        bias_name = "conv_b_" + str(0) + ":0"
        weight_variable = tf.get_default_graph().get_tensor_by_name(weight_name)
        bias_variable = tf.get_default_graph().get_tensor_by_name(bias_name)
        ori_weight_variable = cnn_session.run(weight_variable)
        ori_bias_variable = cnn_session.run(bias_variable)
        train_drop_acc = []
        test_drop_acc = []
        for drop_i in range(50):
            # Drop one first-layer kernel at a time and measure the effect
            # on train/test accuracy.
            drop_weight_variable = np.copy(ori_weight_variable)
            drop_bias_variable = np.copy(ori_bias_variable)
            drop_index = [drop_i]
            up_fir_weight, up_fir_bias = conv_variable_up(drop_weight_variable, drop_bias_variable, drop_index)
            weight_assign = tf.assign(weight_variable, up_fir_weight)
            bias_assign = tf.assign(bias_variable, up_fir_bias)
            cnn_session.run(weight_assign)
            cnn_session.run(bias_assign)
            train_eval_value = accuracy.eval(feed_dict={
                train_x_placeholder: train_x_matrix, output_y_placeholder: train_y_matrix,
                keep_prob_placeholder: 1.0})
            train_drop_acc.append(train_eval_value)
            test_eval_value = accuracy.eval(feed_dict={
                train_x_placeholder: test_x_matrix, output_y_placeholder: test_y_matrix,
                keep_prob_placeholder: 1.0})
            test_drop_acc.append(test_eval_value)
            print("Drop " + str(drop_i))
            print(train_eval_value)
            print(test_eval_value)
        print(train_drop_acc)
        print(np.argsort(train_drop_acc))   # lists have no .argsort()
        print(test_drop_acc)
        print(np.argsort(test_drop_acc))
        raise SystemExit("debug stop")  # was the undefined name 'sdfs'
        print("HERE")
        fir_weight_variable_val = np.squeeze(fir_weight_variable_val)
        kernel_dist_val = cnn_session.run(kernel_dist)
        keep_index_val = cnn_session.run(keep_index)
        print(fir_weight_variable_val.shape)
        print(np.amax(fir_weight_variable_val, axis=1))
        print(np.amin(fir_weight_variable_val, axis=1))
        print(np.mean(fir_weight_variable_val, axis=1))
        mean_row = np.mean(fir_weight_variable_val, axis=-1)
        print(mean_row.shape)
        dist_list = []
        for r in range(40):
            row = fir_weight_variable_val[:, r]
            dist_list.append(np.linalg.norm(row - mean_row))
        print(dist_list)
        print(kernel_dist_val)
        print(keep_index_val)
        print(sorted(dist_list))
        print("!!!")
        #conv_variable_up(fir_weight_variable_val, fir_bias_variable_val)
        raise SystemExit("debug stop")  # was the undefined name 'sdfsd'
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        train_x_matrix = test_x_matrix[0:1, :, :, :]
        #plot_2dmatrix(np.squeeze(train_x_matrix)[:, 0:5])
        fir_out_tensor = tf.nn.conv2d(train_x_placeholder, fir_weight_variable,
                                      strides=[1, 1, 1, 1], padding='VALID') + fir_bias_variable
        fir_out_tensor = tf.nn.relu(fir_out_tensor)
        print(fir_out_tensor.get_shape())
        fir_analysis_tensor = tf.reduce_max(fir_out_tensor, [1])
        print(fir_analysis_tensor.get_shape())
        fir_analysis_tensor = tf.reduce_max(fir_analysis_tensor, [1])
        fir_analysis_tensor = tf.reduce_mean(fir_analysis_tensor, [0])
        top_k_indices = tf.nn.top_k(fir_analysis_tensor, 10).indices
        top_k_values = tf.nn.top_k(fir_analysis_tensor, 10).values
        top_fir_out_tensor = tf.gather(fir_out_tensor, top_k_indices, axis=3)
        sec_weight_variable = tf.get_default_graph().get_tensor_by_name("conv_w_1:0")
        sec_bias_variable = tf.get_default_graph().get_tensor_by_name("conv_b_1:0")
        sec_out_tensor = tf.nn.conv2d(fir_out_tensor, sec_weight_variable,
                                      strides=[1, 1, 1, 1], padding='VALID') + sec_bias_variable
        sec_out_tensor = tf.nn.relu(sec_out_tensor)
        sec_weight_var_val = cnn_session.run(sec_weight_variable)
        fir_out_tensor_val = cnn_session.run(fir_out_tensor, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        print(fir_out_tensor_val.shape)
        top_fir_out_tensor = cnn_session.run(top_fir_out_tensor, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        print(top_fir_out_tensor.shape)
        fir_analysis_tensor_val = cnn_session.run(fir_analysis_tensor, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        print(fir_analysis_tensor.shape)
        top_k_indices_val = cnn_session.run(top_k_indices, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        top_k_values_val = cnn_session.run(top_k_values, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        fir_weight_variable_val = cnn_session.run(fir_weight_variable)
        fir_weight_variable_val = np.squeeze(fir_weight_variable_val)
        print(fir_weight_variable_val.shape)
        print(fir_analysis_tensor_val)
        fir_sort_in = np.argsort(fir_analysis_tensor_val)
        print(fir_sort_in)
        print(top_k_indices_val)
        print(top_k_values_val)
        plot_2dmatrix(fir_weight_variable_val[:, fir_sort_in[-10:]])
        raise SystemExit("debug stop")  # was the undefined name 'sdfd'
        for n in range(len(fir_out_tensor_val)):
            for k in range(50):
                # For each kernel, find the attribute with the largest
                # activation and list the attributes with zero activation.
                ret_str = "k" + str(k) + ": "
                kernel_max = -1
                max_attr = -1
                max_attr_list = []
                for a in range(attr_num):
                    attr_max = max(fir_out_tensor_val[n, :, a, k])
                    max_attr_list.append(attr_max)
                    if attr_max > kernel_max:
                        kernel_max = attr_max
                        max_attr = a
                    if attr_max == 0:
                        ret_str = ret_str + str(a) + " "
                print(ret_str)
                print("max attr " + str(max_attr))
                print(sorted(range(len(max_attr_list)), key=lambda k: max_attr_list[k]))
                print("======")
        print("label " + str(train_y_vector[0]))
        fir_out_tensor_val = cnn_session.run(sec_out_tensor, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        print(fir_out_tensor_val.shape)
        raise SystemExit("debug stop")  # was the undefined name 'sdf'
        for n in range(len(fir_out_tensor_val)):
            for k in range(40):
                # Same per-kernel analysis for the second conv layer.
                ret_str = "k" + str(k) + ": "
                kernel_max = -1
                max_attr = -1
                max_attr_list = []
                for a in range(attr_num):
                    attr_max = max(fir_out_tensor_val[n, :, a, k])
                    max_attr_list.append(attr_max)
                    if attr_max > kernel_max:
                        kernel_max = attr_max
                        max_attr = a
                    if attr_max == 0:
                        ret_str = ret_str + str(a) + " "
                print(ret_str)
                print("max attr " + str(max_attr))
                print(sorted(range(len(max_attr_list)), key=lambda k: max_attr_list[k]))
                print("======")
        raise SystemExit("debug stop")  # was the undefined name 'sdf'
        fir_out_mean_val = cnn_session.run(fir_out_mean, feed_dict={
            train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0})
        print(fir_out_mean_val.shape)
        plot_2dmatrix(np.squeeze(fir_out_mean_val[:, :, 0:5]))
        raise SystemExit("debug stop")  # was the undefined name 'sdfd'
        plot_2dmatrix(fir_weight_var_val)
        min_class = min(train_y_vector)
        max_class = max(train_y_vector)
        num_classes = max_class - min_class + 1
        if cnn_setting.eval_method == "accuracy":
            cnn_eval_key = "acc"
        elif num_classes > 2:
            cnn_eval_key = "acc_batch"
        else:
            cnn_eval_key = "f1"
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(min_class) + "_" + str(max_class) + "_act" + str(cnn_setting.activation_fun) + "_" + cnn_eval_key + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
            logger.info(train_x_matrix[0, 0:3, 0:2, 0])
            logger.info(test_x_matrix[0, 0:3, 0:2, 0])
        train_y_matrix = y_vector_to_matrix(train_y_vector, num_classes)
        test_y_matrix = y_vector_to_matrix(test_y_vector, num_classes)
        cnn_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, feature_list_obj_file = run_cnn(
            train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix, data_stru,
            cnn_setting, saver_file_profix, logger)
        logger.info("Fold eval value: " + str(cnn_eval_value))
        logger.info(method + ' fold training time (sec):' + str(train_run_time))
        logger.info(method + ' fold testing time (sec):' + str(test_run_time))
        logger.info("save obj to " + saver_file)
def multi_projected_cnn_classification_main(parameter_file, file_keyword, function_keyword="multi_proj_classification"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, top_k, method, log_folder, cnn_obj_folder, cnn_temp_folder, cnn_setting_file = read_feature_classification(parameter_file, function_keyword)
    obj_keyword = obj_folder.split('/')[-2]
    model_saved_folder = "../../object/" + data_keyword + "/projected_classification/" + obj_keyword + "_top" + str(top_k) + "_cnn_model_folder/"
    print(obj_keyword)
    print(cnn_obj_folder)
    print(model_saved_folder)
    top_keyword = "_top" + str(top_k) + "."
    group_all = False
    log_folder = init_folder(log_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    file_count = 0
    class_column = 0
    header = True
    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'
    save_obj_folder = "../../object/" + data_keyword + "/" + function_keyword + "/" + obj_keyword + "/"
    save_obj_folder = init_folder(save_obj_folder)
    delimiter = ' '
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(class_id) + '_top' + str(top_k) + '_' + method + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        # found_obj_file = obj_folder + found_obj_file
        feature_dict = load_obj(found_obj_file)[0]
        feature_dict = np.array(feature_dict)
        logger.info("feature array shape: " + str(feature_dict.shape))
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        logger.info("topk: " + str(top_k))
        data_stru.attr_num = top_k
        fold_accuracy, fold_f1_list, fold_load_time, fold_test_time = run_load_predict_cnn(
            file_key, model_saved_folder, feature_dict, top_k, test_x_matrix, test_y_vector,
            data_stru, cnn_setting, group_all, save_obj_folder, logger)
        logger.info("Fold ACC: " + str(fold_accuracy))
        logger.info("Fold F1 list: " + str(fold_f1_list))
        logger.info(method + ' fold loading time (sec):' + str(fold_load_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
def forward_multitime_main(parameter_file="../../parameters/", file_keyword="train_"):
    function_keyword = "forward_wrapper"
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, top_k, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file = read_feature_classification(
        parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, top_k, method, log_folder,
          out_obj_folder, out_model_folder, cnn_setting_file)
    if data_keyword == "dsa" or data_keyword == "toy":
        n_selected_features = 15
        num_classes = 19
    elif data_keyword == "rar":
        n_selected_features = 30
        num_classes = 33
    elif data_keyword == "arc" or data_keyword == "fixed_arc":
        n_selected_features = 30
        num_classes = 18
    elif data_keyword == "asl":
        n_selected_features = 6
        num_classes = 95
    else:
        raise Exception("Please fill in the basic data information first!")
    log_folder = init_folder(log_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    file_count = 0
    class_column = 0
    header = True
    delimiter = ' '
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        # An earlier version could resume from an "already" obj file holding
        # previously selected features; that code path is kept disabled.
        already = False
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(class_id) + '_' + method + "_top" + str(n_selected_features) + '_already' + str(already) + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector = train_test_file_reading(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        n_samples, n_col = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(n_samples, attr_num, attr_len)
        n_samples, n_col = test_x_matrix.shape
        test_x_matrix = test_x_matrix.reshape(n_samples, attr_num, attr_len)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
        for c in range(min_class, max_class):
            logger.info("Class: " + str(c))
            already_feature = []
            temp_train_y_vector = np.where(train_y_vector == c, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == c, 1, 0)
            top_features = forward_multitime(
                train_x_matrix, temp_train_y_vector, test_x_matrix, temp_test_y_vector,
                n_selected_features, data_keyword, file_key, method, cnn_setting_file,
                logger, already_feature)
            logger.info("Top Features For Class " + str(c) + ": " + str(top_features))
            logger.info("End Of Class: " + str(c))
def best_forward_multitime_main(parameter_file="../../parameters/", file_keyword="train_",
                                function_keyword="best_forward_multitime"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, top_k, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file = read_feature_classification(parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
          out_model_folder, cnn_setting_file)
    function_keyword = function_keyword + "_" + method
    if data_keyword == "dsa" or data_keyword == "toy":
        n_selected_features = 15
        num_classes = 19
    elif data_keyword == "rar":
        n_selected_features = 30
        num_classes = 33
    elif data_keyword == "arc" or data_keyword == "fixed_arc":
        n_selected_features = 30
        num_classes = 18
    elif data_keyword == "asl":
        n_selected_features = 6
        num_classes = 95
    else:
        raise Exception("Please fill in the basic data information first!")
    keep_k = 5
    log_folder = init_folder(log_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    file_count = 0
    class_column = 0
    header = True
    delimiter = ' '
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(class_id) + '_' + method + "_top" + str(n_selected_features) + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector = train_test_file_reading(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        n_samples, n_col = train_x_matrix.shape
        train_x_matrix = train_x_matrix.reshape(n_samples, attr_num, attr_len)
        n_samples, n_col = test_x_matrix.shape
        test_x_matrix = test_x_matrix.reshape(n_samples, attr_num, attr_len)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
        for c in range(min_class, max_class):
            logger.info("Class: " + str(c))
            temp_train_y_vector = np.where(train_y_vector == c, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == c, 1, 0)
            top_features = fixed_width_forward_multitime(
                train_x_matrix, temp_train_y_vector, test_x_matrix, temp_test_y_vector,
                n_selected_features, keep_k, data_keyword, file_key, method,
                cnn_setting_file, logger)
            logger.info("Top Features For Class " + str(c) + ": " + str(top_features))
            logger.info("End Of Class: " + str(c))
def project_cnn_feature_combined_rf_lda_analysis(feature_matrix, y_vector, logger=None, rf_estimator=50):
    if logger is None:
        logger = setup_logger('')
    #feature_matrix = np.squeeze(feature_matrix)
    num_instance, num_attribute, num_map = feature_matrix.shape
    predict = True
    rf_time = 0
    lda_time = 0
    feature_matrix_2d = feature_matrix.reshape(num_instance, num_attribute * num_map)
    start_time = time.time()   # was start_time = 0, which broke the timing
    rf_feature_value_vector, rf_model, rf_f1_value, rf_run_time = rf_feature_extraction(
        feature_matrix_2d, y_vector, predict, logger, rf_estimator)
    rf_time = rf_time + time.time() - start_time
    start_time = time.time()
    lda_feature_value_vector, lda_model, lda_f1_value, lda_run_time = lda_feature_extraction(
        feature_matrix_2d, y_vector, predict, logger)
    lda_time = time.time() - start_time
    # Fold the RF importance back to one value per attribute, normalize to
    # sum to 1, then weight by the RF F1 score.
    rf_feature_value_vector = rf_feature_value_vector.reshape(num_attribute, num_map)
    rf_feature_value_vector = np.sum(rf_feature_value_vector, axis=1)
    rf_feature_value_vector = rf_feature_value_vector / float(sum(rf_feature_value_vector))
    rf_feature_value_vector = rf_feature_value_vector * rf_f1_value
    rf_class_attr_list = map_attr_imp_analysis(rf_feature_value_vector, logger)
    logger.info("rf feature value: " + str(rf_feature_value_vector))
    logger.info("rf f1 value: " + str(rf_f1_value))
    logger.info("rf only attr: " + str(rf_class_attr_list))
    feature_value_vector = rf_feature_value_vector
    if lda_feature_value_vector is not None:
        # Same folding and F1 weighting for the LDA importance.
        lda_feature_value_vector = lda_feature_value_vector.reshape(num_attribute, num_map)
        lda_feature_value_vector = np.sum(lda_feature_value_vector, axis=1)
        lda_feature_value_vector = lda_feature_value_vector / float(sum(lda_feature_value_vector))
        lda_feature_value_vector = lda_feature_value_vector * lda_f1_value
        lda_class_attr_list = map_attr_imp_analysis(lda_feature_value_vector, logger)
        lda_max = max(lda_feature_value_vector)
        logger.info("lda only attr: " + str(lda_class_attr_list))
        logger.info("lda feature value: " + str(lda_feature_value_vector))
        logger.info("lda f1 value: " + str(lda_f1_value))
        logger.info("max lda value: " + str(lda_max))
        # Only add the LDA scores when no single attribute dominates them.
        if lda_max < 0.9:
            feature_value_vector = feature_value_vector + lda_feature_value_vector
    feature_value_vector = feature_value_vector / float(sum(feature_value_vector))
    class_attr_list = map_attr_imp_analysis(feature_value_vector, logger)
    logger.info("overall rf and lda feature value: " + str(feature_value_vector))
    logger.info("rf and lda attr: " + str(class_attr_list))
    return feature_value_vector, rf_time + lda_time
def nn_train(train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
             train_x_placeholder, train_y_placeholder, logits_out,
             keep_prob_placeholder, nn_setting, logger):
    if logger is None:
        logger = setup_logger('')
    (overall_len, x_col) = train_x_matrix.shape
    (y_row, num_classes) = train_y_matrix.shape
    predict_y_proba = tf.nn.softmax(logits_out)
    train_y_vector = np.argmax(train_y_matrix, axis=1)
    max_class = max(train_y_vector)
    min_class = min(train_y_vector)
    eval_method = nn_setting.eval_method
    batch_size = nn_setting.batch_size
    stop_threshold = nn_setting.stop_threshold
    max_iter = nn_setting.max_epoch
    saver_file = nn_setting.save_file
    cross_entropy, eval_method_value, eval_method_keyword, coefficient_placeholder = cross_entropy_setup(
        eval_method, num_classes, logits_out, train_y_placeholder)
    # L2-regularize the first fully connected layer.
    beta = 0.001
    full_weight = tf.get_default_graph().get_tensor_by_name("weight_0:0")
    regularizers = tf.nn.l2_loss(full_weight)
    cross_entropy = tf.reduce_mean(cross_entropy + regularizers * beta)
    train_class_index_dict, train_min_length, train_max_length = class_label_vector_checking(train_y_vector)
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    nn_session = tf.InteractiveSession()
    nn_session.run(tf.global_variables_initializer())
    test_eval_value = 0
    best_eval_value = 0
    i = 0
    start = 0
    epoch = 0
    end = batch_size
    batch_each_class = int(batch_size / num_classes)
    saver = tf.train.Saver()
    train_run_time = 0
    np.random.seed(epoch)
    batch_index = np.random.permutation(overall_len)
    logger.info("Random Epoch: " + str(epoch) + str(batch_index[0:5]))
    keep_prob_val = 0.5
    while test_eval_value < stop_threshold:
        if start >= overall_len:
            start = 0
            end = start + batch_size
            epoch = epoch + 1
            np.random.seed(epoch)
            # Re-shuffle before logging (the original logged the stale index
            # order from the previous epoch).
            batch_index = np.random.permutation(overall_len)
            logger.info("Random Epoch: " + str(epoch) + str(batch_index[0:5]))
            print("Random Epoch: " + str(epoch) + str(batch_index[0:5]))
        elif end > overall_len:
            end = overall_len
        batch_x_matrix = train_x_matrix[batch_index[start:end], :]
        batch_y_matrix = train_y_matrix[batch_index[start:end], :]
        if eval_method == 'f1' or eval_method == "acc":
            if i == 0:
                logger.info("Batch controlled")
                print("Batch controlled")
            # Rebalance the batch across classes and weight the loss by the
            # inverse class frequency in the batch.
            batch_x_matrix, batch_y_matrix, coefficients_vector = batch_control(
                batch_x_matrix, batch_y_matrix, train_x_matrix, train_y_matrix, i,
                batch_each_class, min_class, max_class, train_class_index_dict, logger)
            batch_max_len = float(max(coefficients_vector))
            coefficients_vector = batch_max_len / coefficients_vector
            start_time = time.time()
            train_step.run(feed_dict={train_x_placeholder: batch_x_matrix,
                                      train_y_placeholder: batch_y_matrix,
                                      coefficient_placeholder: coefficients_vector,
                                      keep_prob_placeholder: keep_prob_val})
            train_run_time = train_run_time + time.time() - start_time
        else:
            start_time = time.time()
            train_step.run(feed_dict={train_x_placeholder: batch_x_matrix,
                                      train_y_placeholder: batch_y_matrix,
                                      keep_prob_placeholder: keep_prob_val})
            train_run_time = train_run_time + time.time() - start_time
        if i % 100 == 0:
            test_eval_value = eval_method_value.eval(feed_dict={
                train_x_placeholder: test_x_matrix,
                train_y_placeholder: test_y_matrix,
                keep_prob_placeholder: 1.0})
            if str(test_eval_value) == 'nan':
                test_eval_value = 0
            print_str = "step " + str(i) + ", testing " + eval_method_keyword + ": " + str(test_eval_value)
            logger.info(print_str)
            print(print_str)
            if best_eval_value < test_eval_value:
                # Save the variables to disk.
                best_eval_value = test_eval_value
                save_path = saver.save(nn_session, saver_file)
                print_str = "Model saved in file: " + save_path + ' at iteration: ' + str(i)
                logger.info(print_str)
        i = i + 1
        start = end
        end = end + batch_size
        if epoch > max_iter:
            logger.info("best eval value at epoch: " + str(epoch))
            logger.info("best eval value to break")
            logger.info(best_eval_value)
            break
    start_time = time.time()
    test_eval_value = eval_method_value.eval(feed_dict={
        train_x_placeholder: test_x_matrix, train_y_placeholder: test_y_matrix,
        keep_prob_placeholder: 1.0})
    test_run_time = time.time() - start_time
    if test_eval_value < best_eval_value:
        # Restore the best checkpoint if the final model is worse.
        nn_session.close()
        nn_session = tf.InteractiveSession()
        saver.restore(nn_session, saver_file)
    else:
        best_eval_value = test_eval_value
    logger.info("Running iteration: %d" % i)
    logger.info("final best " + eval_method_keyword + ": " + str(best_eval_value))
    logger.info("final test " + eval_method_keyword + ": " + str(test_eval_value))
    print("final best " + eval_method_keyword + ": " + str(best_eval_value))
    print("final test " + eval_method_keyword + ": " + str(test_eval_value))
    nn_predict_proba = nn_session.run(predict_y_proba, feed_dict={
        train_x_placeholder: test_x_matrix, keep_prob_placeholder: 1.0})
    logger.info("NN model saved: " + str(saver_file))
    nn_session.close()
    return best_eval_value, train_run_time, test_run_time, nn_predict_proba
def pv_cnn_generation_main(parameter_file, file_keyword, function_keyword="pv_cnn_generation"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file = read_pv_cnn_generation(
        parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
          out_model_folder, cnn_setting_file)
    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    file_count = 0
    class_column = 0
    header = True
    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    cnn_setting.feature_method = 'save'
    cnn_setting.eval_method = 'f1'
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    result_obj_folder = obj_folder + method + "_result_folder"
    result_obj_folder = init_folder(result_obj_folder)
    delimiter = ' '
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            class_id) + '_' + method + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        #logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        # Call the projected feature function here; just need to set feature_dict = None
        feature_dict = None
        top_k = -1
        model_save_file = file_key + '_count' + str(file_count) + '_' + method
        if method == 'fcn':
            fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_feature_projected_ijcnn_fcn(
                train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, data_stru,
                cnn_setting, feature_dict, top_k, model_save_file, class_id, logger)
        else:
            fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_feature_projected_cnn(
                train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, data_stru,
                cnn_setting, feature_dict, top_k, model_save_file, class_id, logger)
        logger.info("Fold F1: " + str(fold_f1_value))
        logger.info(method + ' fold training time (sec):' + str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info("save obj to " + result_obj_folder + file_key + "_all_feature_" + method + "_result.ckpt")
        save_obj([fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
                  fold_test_time, fold_predict_matrix],
                 result_obj_folder + file_key + "_all_feature_" + method + "_result.ckpt")
        # The original never incremented file_count, so the shape logging above
        # ran for every file and every model was saved as "_count0".
        file_count = file_count + 1
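# save_obj/load_obj are imported from elsewhere in this project. The sketch
# below is a minimal pickle-based stand-in consistent with how they are called
# in this file (a list is saved; load_obj(path)[0] retrieves its first
# element); the project's real helpers may differ.
def _demo_save_load_obj(obj_file="./_demo_result.ckpt"):
    import pickle

    def _save_obj(obj_list, path):
        with open(path, 'wb') as f:
            pickle.dump(obj_list, f)

    def _load_obj(path):
        with open(path, 'rb') as f:
            return pickle.load(f)

    _save_obj([0.95, "f1"], obj_file)
    print(_load_obj(obj_file)[0])  # 0.95, mirroring load_obj(...)[0] above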
def nn_classification_main(parameter_file, file_keyword, function_keyword="nn_classification"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, nn_setting_file = read_all_feature_classification(
        parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
          out_model_folder, nn_setting_file)
    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    file_count = 0
    class_column = 0
    header = True
    # Note: this hard-coded path overrides the nn_setting_file value read from
    # the parameter file above.
    nn_setting_file = "../../parameters/nn_model_parameter.txt"
    nn_setting, nn_key = return_nn_setting_from_file(nn_setting_file)
    result_obj_folder = obj_folder + method + "_result_folder"
    result_obj_folder = init_folder(result_obj_folder)
    delimiter = ' '
    loop_count = -1
    saver_file_prefix = ""
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        saver_file_prefix = file_key
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        min_class = min(train_y_vector)
        max_class = max(train_y_vector)
        num_classes = max_class - min_class + 1
        if nn_setting.eval_method == "accuracy":
            nn_eval_key = "acc"
        elif num_classes > 2:
            nn_eval_key = "acc_batch"
        else:
            nn_eval_key = "f1"
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            min_class) + "_" + str(max_class) + "_act" + str(
            nn_setting.activation_fun) + "_" + nn_eval_key + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('nn setting:\n ' + nn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
            logger.info(train_x_matrix[0, 0:3])
            logger.info(test_x_matrix[0, 0:3])
        # One-hot encode the label vectors for the softmax output layer.
        train_y_matrix = y_vector_to_matrix(train_y_vector, num_classes)
        test_y_matrix = y_vector_to_matrix(test_y_vector, num_classes)
        feature_dict = None
        top_k = -1
        nn_eval_value, train_run_time, test_run_time, nn_predict_proba = run_nn(
            train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix, nn_setting, logger)
        logger.info("Fold eval value: " + str(nn_eval_value))
        logger.info(method + ' fold training time (sec):' + str(train_run_time))
        logger.info(method + ' fold testing time (sec):' + str(test_run_time))
        # The original never incremented file_count; without this the shape
        # logging above would run for every file.
        file_count = file_count + 1
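# y_vector_to_matrix is defined elsewhere; a minimal numpy sketch of the
# one-hot conversion it is assumed to perform, producing the
# (n_samples, num_classes) matrices that run_nn consumes.
def _demo_y_vector_to_matrix():
    import numpy as np
    y_vector = np.array([0, 2, 1, 2])
    num_classes = 3  # max_class - min_class + 1, as computed above
    # If labels start at min_class > 0, subtract min_class first.
    y_matrix = np.zeros((len(y_vector), num_classes))
    y_matrix[np.arange(len(y_vector)), y_vector] = 1
    print(y_matrix)  # np.argmax(y_matrix, axis=1) recovers y_vector
    return y_matrix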
def run_cnn_projected_feature_analysis(feature_folder, class_id, data_folder, data_file_keyword, method="rf_lda", log_folder='./'):
    data_file_list = list_files(data_folder)
    feature_file_list = list_files(feature_folder)
    out_obj_folder = feature_folder[:-1] + "_" + method
    out_obj_folder = init_folder(out_obj_folder)
    class_column = 0
    for train_file in data_file_list:
        if data_file_keyword not in train_file:
            continue
        data_key = train_file.replace('.txt', '')
        data_matrix, attr_num = file_reading(data_folder + train_file)
        train_x_matrix, train_y_vector = x_y_spliting(data_matrix, class_column)
        # class_id < 0 means analyze every class; otherwise only class_id.
        if class_id < 0:
            min_class = min(train_y_vector)
            max_class = max(train_y_vector) + 1
        else:
            min_class = class_id
            max_class = min_class + 1
        log_file = data_key + "_" + method + "_min" + str(min_class) + "_max" + str(max_class) + ".log"
        logger = setup_logger(log_folder + log_file)
        logger.info('data file: ' + train_file)
        out_obj_file = data_key + "_" + method + "_min" + str(min_class) + "_max" + str(max_class) + ".obj"
        out_obj_matrix = []
        for label in range(min_class, max_class):
            logger.info("class: " + str(label))
            feature_key = "_class" + str(label) + "_"
            for feature_file in feature_file_list:
                if data_key not in feature_file or feature_key not in feature_file:
                    continue
                logger.info("feature file: " + feature_file)
                feature_obj = load_obj(feature_folder + feature_file)
                train_feature = obj_processing(feature_obj[0])
                logger.info("train feature shape: " + str(train_feature.shape))
                # One-vs-rest labels for the current class.
                class_train_y = np.where(train_y_vector == label, 1, 0)
                logger.info("feature method: " + str(method))
                if method == "rf_lda_sum":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_rf_lda_analysis(
                        train_feature, class_train_y, logger)
                elif method == "rf":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_rf_analysis(
                        train_feature, class_train_y, logger)
                elif method == "lda":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_lda_analysis(
                        train_feature, class_train_y, logger)
                elif method == "cpca":
                    class_attr_imp_matrix, class_run_time = project_cnn_feature_combined_cpca_analysis(
                        train_feature, class_train_y, logger)
                else:
                    # Guard added: the original fell through with
                    # class_attr_imp_matrix undefined for unknown methods.
                    raise ValueError("unknown feature method: " + str(method))
                if method == "cpca":
                    # cpca already returns a ranked attribute list.
                    class_attr_list = class_attr_imp_matrix
                else:
                    logger.info("class attr imp matrix shape: " + str(class_attr_imp_matrix.shape))
                    class_attr_list = map_attr_imp_analysis(class_attr_imp_matrix, logger)
                logger.info(class_attr_list)
                out_obj_matrix.append(class_attr_list)
        out_obj_matrix = np.array(out_obj_matrix)
        logger.info("out obj to: " + out_obj_folder + out_obj_file)
        logger.info(out_obj_matrix.shape)
        save_obj([out_obj_matrix], out_obj_folder + out_obj_file)
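# run_cnn_projected_feature_analysis scores attributes one class at a time by
# collapsing the labels into a binary one-vs-rest vector. A self-contained
# illustration of that np.where relabeling:
def _demo_one_vs_rest_labels():
    import numpy as np
    train_y_vector = np.array([0, 0, 1, 1, 2, 2])
    label = 1
    class_train_y = np.where(train_y_vector == label, 1, 0)
    print(class_train_y)  # [0 0 1 1 0 0]: class 1 against the rest
    return class_train_y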
def cnn_classification_main(parameter_file, file_keyword, function_keyword="cnn_classification"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file = read_all_feature_classification(
        parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
          out_model_folder, cnn_setting_file)
    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    file_list = list_files(data_folder)
    file_count = 0
    class_column = 0
    header = True
    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    result_obj_folder = obj_folder + method + "_result_folder"
    result_obj_folder = init_folder(result_obj_folder)
    delimiter = ' '
    loop_count = -1
    saver_file_prefix = ""
    # The original assigned attention_type = 0 and immediately overwrote it;
    # -1 is the value actually used.
    attention_type = -1
    cnn_setting.attention_type = attention_type
    # trans_bool True: ins * attr_len * 1 * attr_num
    # trans_bool False: ins * attr_len * attr_num * 1
    trans_bool = False
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        saver_file_prefix = file_key + "_atten" + str(attention_type)
        valid_file = data_folder + train_file.replace('train', 'valid')
        if os.path.isfile(valid_file) is False:
            valid_file = ''
        test_file = data_folder + train_file.replace('train', 'test')
        if os.path.isfile(test_file) is False:
            test_file = ''
        data_group, attr_num = train_test_file_reading(
            data_folder + train_file, test_file, valid_file, class_column, delimiter, header)
        data_group_processing(data_group, attr_num, trans_bool)
        data_stru = data_group.gene_data_stru()
        data_group.data_check(data_stru.num_classes, data_stru.min_class)
        if cnn_setting.eval_method == "accuracy":
            cnn_eval_key = "acc"
        elif num_classes > 2:
            cnn_eval_key = "acc_batch"
        else:
            cnn_eval_key = "f1"
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            data_stru.min_class) + "_" + str(data_stru.num_classes) + "_act" + str(
            cnn_setting.activation_fun) + "_" + cnn_eval_key + "_attention" + str(attention_type) + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        if file_count == 0:
            logger.info('train matrix shape: ' + str(data_group.train_x_matrix.shape))
            logger.info('train label shape: ' + str(data_group.train_y_vector.shape))
            logger.info(data_group.train_x_matrix[0, 0:3, 0:2, 0])
        pred_y_prob, train_run_time, test_run_time, cnn_model = run_cnn(
            cnn_setting, data_group, saver_file_prefix, logger)
        pred_y_vector = np.argmax(pred_y_prob, axis=1)
        avg_acc, ret_str = averaged_class_based_accuracy(pred_y_vector, data_group.test_y_vector)
        acc_value = accuracy_score(data_group.test_y_vector, pred_y_vector, True)
        # The original logged acc_value under the "Averaged acc" label; log
        # both values under their own names instead.
        logger.info("Class-averaged acc: " + str(avg_acc))
        logger.info("Overall acc: " + str(acc_value))
        logger.info(ret_str)
        logger.info("Fold eval value: " + str(acc_value))
        logger.info(method + ' fold training time (sec):' + str(train_run_time))
        logger.info(method + ' fold testing time (sec):' + str(test_run_time))
        logger.info("save obj to " + cnn_model.saver_file)
        # The original never incremented file_count.
        file_count = file_count + 1
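# averaged_class_based_accuracy is defined elsewhere; the sketch below assumes
# it is the macro average of per-class accuracy (mean per-class recall), which
# matches how it is logged alongside the overall accuracy above.
def _demo_averaged_class_based_accuracy():
    import numpy as np
    y_pred = np.array([0, 0, 1, 1, 1])
    y_true = np.array([0, 1, 1, 1, 0])
    per_class_acc = []
    for label in np.unique(y_true):
        mask = (y_true == label)
        per_class_acc.append(float(np.sum(y_pred[mask] == label)) / np.sum(mask))
    avg_acc = sum(per_class_acc) / len(per_class_acc)
    print(avg_acc)  # class 0: 1/2, class 1: 2/3 -> ~0.583
    return avg_acc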
def multi_proj_feature_classification(parameter_file, file_keyword, function_keyword="multi_proj_feature_classification"):
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, top_k, method, log_folder, cnn_obj_folder, cnn_temp_folder, cnn_setting_file = read_feature_classification(
        parameter_file, function_keyword)
    log_folder = init_folder(log_folder)
    if method == 'cnn':
        return projected_cnn_classification_main(parameter_file, file_keyword)
    else:
        # Need to check the rest. Note: this unconditional return leaves all
        # of the code below unreachable in the current version.
        return False
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
          class_column, class_id, obj_folder, top_k, method, log_folder,
          cnn_obj_folder, cnn_temp_folder, cnn_setting_file)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    print(obj_folder)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    class_column = 0
    header = True
    save_obj_folder = obj_folder[:-1] + "_" + method + "_out"
    save_obj_folder = init_folder(save_obj_folder)
    delimiter = ' '
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            class_id) + '_top' + str(top_k) + '_' + method + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('method: ' + method)
        logger.info('============')
        # Find the saved feature object matching this training file.
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        print(found_obj_file)
        found_obj_file = obj_folder + found_obj_file
        feature_array = load_obj(found_obj_file)[0]
        feature_array = np.array(feature_array)
        logger.info("feature array shape: " + str(feature_array.shape))
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        if loop_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        data_stru.attr_num = top_k
        fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_feature_projected_classification(
            train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
            feature_array, top_k, method, class_id, logger)
        logger.info("Fold F1: " + str(fold_f1_value))
        logger.info(method + ' fold training time (sec):' + str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info("save obj to " + save_obj_folder + file_key + "_" + method + "_project_" + method + "_result.ckpt")
        save_obj([fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
                  fold_test_time, fold_predict_matrix],
                 save_obj_folder + file_key + "_" + method + "_project_" + method + "_result.ckpt")
def fixed_width_forward_multitime(train_x, train_y, test_x, test_y, n_selected_features, keep_k=5, data_key="test", fold_key="", method="cnn", cnn_setting_file="../../parameters/cnn_model_parameter.txt", logger=None, function_key="best_forward_multitime"):
    """
    Greedy forward feature selection over multivariate time series, scored by
    either a CNN or a random-forest evaluator.

    Input
    -----
    train_x: {3d numpy array}, shape (n_samples, n_features, time_length)
        training data
    train_y: {1d numpy array}, shape (n_samples,)
        training class labels
    test_x: {3d numpy array}, shape (n_samples, n_features, time_length)
        test data
    test_y: {1d numpy array}, shape (n_samples,)
        test class labels

    Output
    ------
    F: {numpy array}, shape (n_selected_features,)
        indices of the selected features, in selection order
    """
    if logger is None:
        log_file = ""
        logger = setup_logger(log_file)
    train_samples, n_features, time_length = train_x.shape
    eval_method = "f1"
    if method == "cnn":
        min_class = min(train_y)
        max_class = max(train_y)
        num_classes = max_class - min_class + 1
        data_stru = data_structure(num_classes, min_class, n_features, time_length)
        cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        saver_file_prefix = "../../object/" + data_key + "/" + function_key + "/cnn_model_folder/"
        saver_file_prefix = init_folder(saver_file_prefix)
        saver_file_prefix = saver_file_prefix + fold_key
        eval_method = cnn_setting.eval_method
    elif method == "rf":
        model = RandomForestClassifier(n_estimators=20, random_state=0)
    # Selected feature set; starts empty and grows one feature per iteration.
    F = []
    F_eval_score = []
    F_available = []
    count = len(F)
    if count == 0:
        # list(...) so that F_available.remove() also works under Python 3.
        F_available = list(range(n_features))
        F_eval_score = np.zeros(n_features) - 1
    while count < n_selected_features:
        max_eval_value = -1
        f_score = []
        logger.info("For iter " + str(count))
        logger.info("available list for this iter: " + str(F_available))
        for i in F_available:
            if i not in F:
                # Tentatively add feature i and evaluate the candidate set.
                F.append(i)
                train_x_tmp = train_x[:, F, :]
                test_x_tmp = test_x[:, F, :]
                F_key = str(F)[1:-1]
                if method == "cnn":
                    eval_value, train_run_time, test_run_time, predict_proba, saver_file, feature_list_obj_file, relu_based_array = model_evaluation_cnn(
                        train_x_tmp, train_y, test_x_tmp, test_y, data_stru, cnn_setting,
                        saver_file_prefix + "_F" + F_key, logger)
                    f_eval_value = eval_value
                elif method == "rf":
                    eval_value, train_run_time, test_run_time = model_evaluation_rf(
                        train_x_tmp, train_y, test_x_tmp, test_y, model, logger)
                    f_eval_value = eval_value
                if count == 0:
                    F_eval_score[i] = eval_value
                logger.info("Features With: " + str(F))
                logger.info("Adding Feature " + str(i) + ": ")
                logger.info(method + " " + eval_method + " Value For Feature " + str(i) + ": " + str(f_eval_value))
                logger.info(method + " Training time (sec): " + str(train_run_time))
                logger.info(method + " Testing time (sec): " + str(test_run_time))
                f_score.append(f_eval_value)
                F.pop()
                # Record the feature giving the largest eval value so far.
                if eval_value > max_eval_value:
                    max_eval_value = eval_value
                    idx = i
        F_eval_score[idx] = -1
        if count == 0:
            # First iteration: keep only the keep_k best-scoring candidates
            # as the fixed-width pool for later iterations.
            F_available = []
            for sel in range(keep_k):
                add_id = np.argmax(F_eval_score)
                F_available.append(add_id)
                F_eval_score[add_id] = -1
        else:
            # Replace the consumed candidate with the next best-scoring one.
            F_available.remove(idx)
            add_id = np.argmax(F_eval_score)
            F_available.append(add_id)
            F_eval_score[add_id] = -1
        logger.info("Eval score vector: " + str(f_score))
        logger.info("The added attribute is: " + str(idx))
        logger.info("largest eval value is: " + str(max_eval_value))
        # Permanently add the feature with the largest eval value.
        F.append(idx)
        count += 1
    return np.array(F)
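# A compact restatement of the greedy loop above on 2-D data with a plain
# RandomForestClassifier scorer, so the selection logic can be tested in
# isolation. It deliberately omits the keep_k candidate-pool pruning and the
# CNN branch; the data and train/test split are synthetic.
def _demo_forward_selection(n_selected_features=2):
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    rng = np.random.RandomState(0)
    X = rng.rand(60, 5)
    y = (X[:, 2] > 0.5).astype(int)  # only feature 2 is informative
    model = RandomForestClassifier(n_estimators=20, random_state=0)
    selected = []
    while len(selected) < n_selected_features:
        best_feat, best_score = None, -1.0
        for i in range(X.shape[1]):
            if i in selected:
                continue
            cols = selected + [i]  # tentatively add feature i
            model.fit(X[:40][:, cols], y[:40])
            score = model.score(X[40:][:, cols], y[40:])
            if score > best_score:
                best_feat, best_score = i, score
        selected.append(best_feat)  # keep the best-scoring feature
    return np.array(selected)  # feature 2 should be selected first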
def run_z_norm_main(data_folder, file_keyword="train_", logger=None, class_column=0, delimiter=' ', header=True):
    if logger is None:
        logger = setup_logger('')
    if data_folder.endswith('/'):
        out_folder = data_folder[:-1] + "_z_norm/"
    else:
        out_folder = data_folder + "_z_norm/"
    out_folder = init_folder(out_folder)
    file_list = list_files(data_folder)
    file_count = 0
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        logger.info(train_file)
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        train_row, train_col = train_x_matrix.shape
        test_row, test_col = test_x_matrix.shape
        # Integer division (the original used '/', which breaks under Python 3).
        attr_len = train_col // attr_num
        # Reshape to (instances, attributes, time) before normalizing.
        train_x_matrix = train_x_matrix.reshape(train_row, attr_num, attr_len)
        test_x_matrix = test_x_matrix.reshape(test_row, attr_num, attr_len)
        norm_train_matrix = run_z_normalization(train_x_matrix)
        norm_test_matrix = run_z_normalization(test_x_matrix)
        if file_count == 0:
            logger.info("Before norm")
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info("After norm")
            logger.info('train matrix shape: ' + str(norm_train_matrix.shape))
            logger.info('test matrix shape: ' + str(norm_test_matrix.shape))
        # Flatten back and prepend the label column before writing.
        norm_train_matrix = norm_train_matrix.reshape(train_row, train_col)
        norm_test_matrix = norm_test_matrix.reshape(test_row, test_col)
        train_y_vector = train_y_vector.reshape(len(train_y_vector), 1)
        test_y_vector = test_y_vector.reshape(len(test_y_vector), 1)
        norm_train_matrix = np.hstack((train_y_vector, norm_train_matrix))
        norm_test_matrix = np.hstack((test_y_vector, norm_test_matrix))
        if file_count == 0:
            logger.info("before write to file")
            logger.info('train matrix shape: ' + str(norm_train_matrix.shape))
            logger.info('test matrix shape: ' + str(norm_test_matrix.shape))
        file_writing(norm_train_matrix, out_folder + train_file, attr_num)
        file_writing(norm_test_matrix, out_folder + test_file, attr_num)
        if norm_checking(out_folder + train_file) is False or norm_checking(out_folder + test_file) is False:
            logger.info("ERROR!!!")
            raise Exception("ERROR!!!")
        file_count = file_count + 1
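# run_z_normalization is imported from elsewhere. The sketch below assumes it
# z-scores each attribute series along the time axis of a
# (n_instances, attr_num, attr_len) array, which matches the reshapes in
# run_z_norm_main; the epsilon guards constant series.
def _demo_run_z_normalization(x_matrix=None):
    import numpy as np
    if x_matrix is None:
        x_matrix = np.random.RandomState(0).rand(10, 3, 20)
    mean = x_matrix.mean(axis=2, keepdims=True)
    std = x_matrix.std(axis=2, keepdims=True)
    return (x_matrix - mean) / (std + 1e-8)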
def global_classification_main(parameter_file, file_keyword):
    function_keyword = "global_classification"
    data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, top_k, method, log_folder, cnn_obj_folder, cnn_temp_folder, cnn_setting_file = read_feature_classification(
        parameter_file, function_keyword)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)
    file_count = 0
    class_column = 0
    header = True
    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'
    init_folder(cnn_obj_folder)
    init_folder(cnn_temp_folder)
    all_result_matrix = np.zeros((10, num_classes))
    train_file_vector = []
    prediction_matrix = []
    f1_value_matrix = []
    accuracy_vector = []
    delimiter = ' '
    all_accuracy = 0
    all_train_time = 0
    all_test_time = 0
    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = log_folder + data_keyword + '_' + file_key + '_' + function_keyword + '_class' + str(
            class_id) + '_top' + str(top_k) + '_' + method + '.log'
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')
        # The original had a bare `continue` here (an apparent debug leftover)
        # that skipped the entire loop body below; it is removed so the
        # classification actually runs.
        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        print(found_obj_file)
        print(cnn_setting.save_obj_folder + file_key + "_" + method + "_projected_result.ckpt")
        # Restored: the original commented this out, which made load_obj look
        # for the bare file name relative to the working directory.
        found_obj_file = obj_folder + found_obj_file
        feature_dict = load_obj(found_obj_file)[0]
        feature_dict = np.array(feature_dict)
        logger.info("feature array shape: " + str(feature_dict.shape))
        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column, delimiter, header)
        if file_count == 0:
            logger.info('train matrix shape: ' + str(train_x_matrix.shape))
            logger.info('train label shape: ' + str(train_y_vector.shape))
            logger.info('test matrix shape: ' + str(test_x_matrix.shape))
            logger.info('test label shape: ' + str(test_y_vector.shape))
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        data_stru.attr_num = top_k
        fold_accuracy, fold_avg_eval, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_feature_projected_cnn(
            train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, data_stru,
            cnn_setting, feature_dict, top_k, file_key + '_count' + str(file_count), class_id, logger)
        prediction_matrix.append(fold_predict_y)
        # The original logged an undefined fold_f1_value_list here; log the
        # returned fold_avg_eval instead.
        logger.info("Fold avg eval: " + str(fold_avg_eval))
        accuracy_vector.append(fold_accuracy)
        all_accuracy = all_accuracy + fold_accuracy
        all_train_time = all_train_time + fold_train_time
        all_test_time = all_test_time + fold_test_time
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info(method + ' fold training time (sec):' + str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        # save_obj_folder was never defined in the original;
        # cnn_setting.save_obj_folder matches the path printed above.
        save_obj([fold_accuracy, fold_avg_eval, fold_predict_y, fold_train_time,
                  fold_test_time, fold_predict_matrix],
                 cnn_setting.save_obj_folder + file_key + "_" + method + "_global_cnn_result.ckpt")
        file_count = file_count + 1