def backward_multitime(train_x, train_y, test_x, test_y, n_selected_features,
                       data_key="test", method="cnn",
                       cnn_setting_file="../../parameters/cnn_model_parameter.txt",
                       logger=None):
    """
    Greedy backward feature elimination scored by a CNN or a random forest.

    Starting from the full feature set, every pass temporarily drops each
    remaining feature, re-evaluates the model on the rest, and then
    permanently removes the feature whose absence produced the highest
    evaluation value. Passes repeat until only ``n_selected_features``
    features remain.

    Input
    -----
    train_x: {3d numpy array matrix}, shape (n_samples, n_features, time_length)
        input data
    train_y: {1d numpy array vector}, shape (n_samples,)
        input class labels
    test_x: {3d numpy array matrix}, shape (n_samples, n_features, time_length)
        input data
    test_y: {1d numpy array vector}, shape (n_samples,)
        input class labels
    n_selected_features: int
        number of features to keep
    data_key: str
        folder component used when building the CNN checkpoint prefix
    method: str
        "cnn" or "rf"
    cnn_setting_file: str
        path handed to ``return_cnn_setting_from_file`` (CNN only)
    logger:
        logger to write to; one is created with ``setup_logger("")`` if None

    Output
    ------
    F: {numpy array}, shape (n_selected_features, )
        index of selected features

    Raises
    ------
    ValueError
        if ``method`` is neither "cnn" nor "rf"
    """
    if method not in ("cnn", "rf"):
        raise ValueError("Unsupported method: " + str(method))
    if logger is None:
        logger = setup_logger("")

    _, n_features, time_length = train_x.shape
    eval_method = "f1"

    if method == "cnn":
        min_class = min(train_y)
        max_class = max(train_y)
        num_classes = max_class - min_class + 1
        data_stru = data_structure(num_classes, min_class, n_features, time_length)
        cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        saver_file_profix = "../../object/" + data_key + "/backward_multitime/" + method
        saver_file_profix = init_folder(saver_file_profix)
        saver_file_profix = saver_file_profix + return_cnn_keyword(cnn_setting)
        eval_method = cnn_setting.eval_method
    else:
        model = RandomForestClassifier(n_estimators=50, random_state=0)

    def evaluate(cur_train_x, cur_test_x):
        # Returns (eval value, training time, testing time); the CNN
        # evaluator returns extra artefacts that are not needed here.
        if method == "cnn":
            return model_evaluation_cnn(
                cur_train_x, train_y, cur_test_x, test_y, data_stru,
                cnn_setting, saver_file_profix, logger)[:3]
        return model_evaluation_rf(
            cur_train_x, train_y, cur_test_x, test_y, model, logger)

    all_f_eval_value, all_f_train_time, all_f_test_time = evaluate(train_x, test_x)
    logger.info("With ALL Feature")
    logger.info(method + " " + eval_method + " Value For ALL Feature: " + str(all_f_eval_value))
    logger.info(method + " Training time (sec): " + str(all_f_train_time))
    logger.info(method + " Testing time (sec): " + str(all_f_test_time))

    # Selected feature set, initialized to contain all features. It must be
    # a real list: a Python 3 range object has no remove()/append().
    F = list(range(n_features))
    iter_num = 0
    while len(F) > n_selected_features:
        max_eval_value = -1
        idx = None
        # Score drops for this pass only.
        f_score = []
        for i in sorted(F):
            # Take feature i out for this evaluation. Re-appending it
            # afterwards moves it to the end of F, so within a pass the
            # column order handed to the model is rotated.
            F.remove(i)
            eval_value, train_run_time, test_run_time = evaluate(
                train_x[:, F, :], test_x[:, F, :])
            F.append(i)

            f_eval_value = all_f_eval_value - eval_value
            logger.info("Without Feature " + str(i) + ": ")
            logger.info(method + " " + eval_method + " Value For Feature " + str(i) + ": " + str(f_eval_value))
            logger.info(method + " Training time (sec): " + str(train_run_time))
            logger.info(method + " Testing time (sec): " + str(test_run_time))
            f_score.append(f_eval_value)

            # record the feature whose removal results in the largest score
            if eval_value > max_eval_value:
                max_eval_value = eval_value
                idx = i

        if idx is None:
            # Nothing could be evaluated (e.g. F is already empty), so there
            # is nothing left to remove.
            break

        logger.info("For iter " + str(iter_num))
        logger.info("Eval score vector: " + str(f_score))
        logger.info("The removed attribute is: " + str(idx))
        # delete the feature whose removal results in the largest score
        F.remove(idx)
        iter_num = iter_num + 1
    return np.array(F)
def _find_checkpoint(model_file_list, file_key, model_folder):
    """Return the ".ckpt" path of the first model file starting with file_key.

    An empty string is returned when no file name matches.
    """
    for model_file in model_file_list:
        if model_file.startswith(file_key):
            # Keep only the part before the first dot, then add ".ckpt".
            return model_folder + model_file.split('.')[0] + ".ckpt"
    return ""


def _one_vs_rest_labels(y_vector, target_class):
    """Return a 0/1 vector marking the entries of y_vector equal to target_class."""
    labels = np.zeros(len(y_vector))
    labels[np.where(y_vector == target_class)[0]] = 1
    return labels


def cnn_load_main(parameter_file, file_keyword, function_keyword="cnn_classification"):
    """Restore a trained CNN and re-classify from its last convolution output.

    For the first data file whose name contains ``file_keyword`` this:

    1. reads the train/test data and restores the checkpoint in the model
       folder whose name starts with the file key;
    2. runs the first tensor returned in ``load_model``'s feature list on
       train and test data and passes both results through
       ``top_attr_x_matrix``;
    3. ranks kernels per class with ``last_conv_analysis`` and trains one
       binary network per class (``run_nn``) on the chosen kernels;
    4. prints the accuracy of the restored CNN next to the accuracy of the
       per-class networks and calls ``load_result_analysis``.

    Only the first matching file is analysed: earlier revisions aborted at
    that point through an undefined-name tripwire, and everything behind it
    was unreachable experimental code. The function now returns cleanly
    instead.

    Parameters
    ----------
    parameter_file : parameter file handed to ``read_all_feature_classification``
    file_keyword : substring a training file name must contain to be used
    function_keyword : section keyword handed to the parameter reader

    Raises
    ------
    Exception
        if no file in the model folder starts with the training file key.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
     out_model_folder, cnn_setting_file) = read_all_feature_classification(
         parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file)

    init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)

    # The data files are read with the label in column 0 and a header line,
    # regardless of the class column given in the parameter file.
    class_column = 0
    header = True
    delimiter = ' '

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    cnn_setting.full_feature_num = 400
    # Repeated init_folder calls kept as-is; their return value is not needed.
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    print (out_model_folder)
    model_file_list = list_files(out_model_folder)
    init_folder(obj_folder + method + "_result_folder")
    logger = setup_logger('')

    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        file_key = train_file.replace('.txt', '')
        test_file = train_file.replace('train', 'test')

        data_group, attr_num = train_test_file_reading(data_folder + train_file, data_folder + test_file, '', class_column, delimiter, header)
        train_y_vector = data_group.train_y_vector
        test_y_vector = data_group.test_y_vector
        train_x_matrix = train_test_transpose(data_group.train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(data_group.test_x_matrix, attr_num, attr_len, False)
        test_y_matrix = y_vector_to_matrix(test_y_vector, num_classes)

        found_model_file = _find_checkpoint(model_file_list, file_key, out_model_folder)
        if found_model_file == "":
            raise Exception("No model object file found!!!")
        print(found_model_file)

        cnn_session, logits_out, train_x_placeholder, keep_prob_placeholder, keeped_feature_list = load_model(found_model_file, data_stru, cnn_setting, logger)
        try:
            # Everything that needs the live session happens here so the
            # session is released even if one of the runs fails.
            train_feed = {train_x_placeholder: train_x_matrix, keep_prob_placeholder: 1.0}
            test_feed = {train_x_placeholder: test_x_matrix, keep_prob_placeholder: 1.0}
            last_conv_tensor = keeped_feature_list[0]
            train_last_conv = cnn_session.run(last_conv_tensor, feed_dict=train_feed)
            test_last_conv = cnn_session.run(last_conv_tensor, feed_dict=test_feed)

            output_y_placeholder = tf.placeholder(tf.float32, [None, num_classes])
            actual = tf.argmax(output_y_placeholder, axis=1)
            prediction = tf.argmax(logits_out, axis=1)
            correct_prediction = tf.equal(actual, prediction)
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
            ori_pred_y_vector = cnn_session.run(prediction, feed_dict=test_feed)
            accuracy_feed = dict(test_feed)
            accuracy_feed[output_y_placeholder] = test_y_matrix
            test_accuracy = cnn_session.run(accuracy, feed_dict=accuracy_feed)
        finally:
            cnn_session.close()

        drop_num = 10
        print(np.squeeze(test_last_conv[1, :, :, :]))
        test_last_conv = top_attr_x_matrix(test_last_conv, drop_num)
        print(np.squeeze(test_last_conv[1, :, :, :]))
        train_last_conv = top_attr_x_matrix(train_last_conv, drop_num)

        kernel_eval_matrix, ref_kernel_eval_matrix = last_conv_analysis(train_last_conv, train_y_vector)
        print(kernel_eval_matrix.shape)
        print(kernel_eval_matrix)

        train_ins_len = len(train_y_vector)
        test_ins_len = len(test_y_vector)

        # Fixed settings of the small per-class network.
        batch_size = 100
        layer_list = np.array([400])
        max_epoch = 10
        stop_threshold = 0.99
        activation_fun = 3
        std_value = 0.02
        eval_method = "acc"
        saver_file = './test_1.save'
        nn_setting = nn_parameters(layer_list, batch_size, max_epoch, stop_threshold, activation_fun, std_value, eval_method, saver_file)

        keep_num = 5
        all_pred_prob = []
        for c in range(num_classes):
            train_y_m_class = y_vector_to_matrix(_one_vs_rest_labels(train_y_vector, c), 2)
            test_y_m_class = y_vector_to_matrix(_one_vs_rest_labels(test_y_vector, c), 2)

            kernel_index = kernel_eval_matrix[c, 0:keep_num]
            ref_kernel_index = ref_kernel_eval_matrix[c, 0:keep_num]
            print("kernel index " + str(kernel_index))
            print("ref kernel index " + str(ref_kernel_index))
            kernel_index = np.concatenate((kernel_index, ref_kernel_index), axis=0)
            print("union index " + str(kernel_index))
            kernel_index = np.unique(kernel_index)
            print("unique index " + str(kernel_index))
            # The union above is only printed; the network is trained on the
            # reference kernels alone.
            kernel_index = ref_kernel_eval_matrix[c, 0:keep_num]

            train_x_class = train_last_conv[:, :, :, kernel_index]
            test_x_class = test_last_conv[:, :, :, kernel_index]
            print(train_x_class.shape)
            # Flatten each instance; -1 lets numpy derive the width instead
            # of assuming 45 values per kernel.
            train_x_class = train_x_class.reshape((train_ins_len, -1))
            test_x_class = test_x_class.reshape((test_ins_len, -1))

            c_eval_value, c_train_time, c_test_time, c_predict_proba = run_nn(train_x_class, train_y_m_class, test_x_class, test_y_m_class, nn_setting)
            # Score of class c: positive-class output minus negative-class output.
            all_pred_prob.append(c_predict_proba[:, 1]-c_predict_proba[:, 0])

        all_pred_prob = np.array(all_pred_prob)
        print(all_pred_prob.shape)
        pred_vector = np.argmax(all_pred_prob, axis=0)
        print(pred_vector)
        # Show the per-class scores of (at most) the first three test instances.
        for col in range(min(3, all_pred_prob.shape[1])):
            print(all_pred_prob[:, col])

        final_accuracy = accuracy_score(pred_vector, test_y_vector)
        avg_acc, ret_str = averaged_class_based_accuracy(ori_pred_y_vector, test_y_vector)
        print("original avg acc" + str(avg_acc))
        print("original accuracy: " + str(test_accuracy))
        print(ret_str)
        avg_acc, ret_str = averaged_class_based_accuracy(pred_vector, test_y_vector)
        print("avg acc" + str(avg_acc))
        print("new accuracy: " + str(final_accuracy))
        print(ret_str)
        load_result_analysis(all_pred_prob, test_y_vector)
        # Stop after the first matching file (see docstring).
        return
def cnn_classification_main(parameter_file, file_keyword, function_keyword="cnn_classification"):
    """Train and evaluate a CNN on every training file matching file_keyword.

    Settings are read with ``read_all_feature_classification``. For each
    matching training file the sibling "valid" and "test" files are used when
    they exist on disk (an empty path is passed otherwise), the data group is
    preprocessed, ``run_cnn`` is called, and the accuracy, the class-averaged
    accuracy and the timings are written to a per-file log.

    Parameters
    ----------
    parameter_file : parameter file handed to ``read_all_feature_classification``
    file_keyword : substring a training file name must contain to be used
    function_keyword : section keyword handed to the parameter reader
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
     out_model_folder, cnn_setting_file) = read_all_feature_classification(
         parameter_file, function_keyword)
    print(data_keyword, data_folder, attr_num, attr_len, num_classes, start_class, class_column, class_id, obj_folder, method, log_folder, out_obj_folder, out_model_folder, cnn_setting_file)

    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)
    file_list = list_files(data_folder)

    # The data files are read with the label in column 0 and a header line,
    # regardless of the class column given in the parameter file.
    class_column = 0
    header = True
    delimiter = ' '

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    # Repeated init_folder calls kept as-is; their return value is not needed.
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    init_folder(obj_folder + method + "_result_folder")

    attention_type = -1
    cnn_setting.attention_type = attention_type
    # True: means ins * attr_len * 1 * attr_num
    # False: means ins * attr_len * attr_num * 1
    trans_bool = False

    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        saver_file_profix = file_key + "_atten" + str(attention_type)

        # Optional companion files: pass '' when one is missing.
        valid_file = data_folder + train_file.replace('train', 'valid')
        if not os.path.isfile(valid_file):
            valid_file = ''
        test_file = data_folder + train_file.replace('train', 'test')
        if not os.path.isfile(test_file):
            test_file = ''

        data_group, attr_num = train_test_file_reading(
            data_folder + train_file, test_file, valid_file, class_column,
            delimiter, header)
        data_group_processing(data_group, attr_num, trans_bool)
        data_stru = data_group.gene_data_stru()
        data_group.data_check(data_stru.num_classes, data_stru.min_class)

        # NOTE(review): the branch below uses the class count from the
        # parameter file, while the log name uses the one derived from the
        # data - confirm that this is intended.
        if cnn_setting.eval_method == "accuracy":
            cnn_eval_key = "acc"
        elif num_classes > 2:
            cnn_eval_key = "acc_batch"
        else:
            cnn_eval_key = "f1"

        log_file = (log_folder + data_keyword + '_' + file_key + '_'
                    + function_keyword + '_class' + str(data_stru.min_class)
                    + "_" + str(data_stru.num_classes) + "_act"
                    + str(cnn_setting.activation_fun) + "_" + cnn_eval_key
                    + "_attention" + str(attention_type) + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')

        # Every file has its own log, so the shapes are recorded in each one.
        logger.info('train matrix shape: ' + str(data_group.train_x_matrix.shape))
        logger.info('train label shape: ' + str(data_group.train_y_vector.shape))
        logger.info(data_group.train_x_matrix[0, 0:3, 0:2, 0])

        pred_y_prob, train_run_time, test_run_time, cnn_model = run_cnn(
            cnn_setting, data_group, saver_file_profix, logger)
        pred_y_vector = np.argmax(pred_y_prob, axis=1)

        avg_acc, ret_str = averaged_class_based_accuracy(
            pred_y_vector, data_group.test_y_vector)
        acc_value = accuracy_score(data_group.test_y_vector, pred_y_vector, True)
        # Report the class-averaged accuracy here; the plain accuracy is
        # logged separately as the fold eval value.
        logger.info("Averaged acc: " + str(avg_acc))
        logger.info(ret_str)
        logger.info("Fold eval value: " + str(acc_value))
        logger.info(method + ' fold training time (sec):' + str(train_run_time))
        logger.info(method + ' fold testing time (sec):' + str(test_run_time))
        logger.info("save obj to " + cnn_model.saver_file)
def _take_best_unused(scores):
    """Return the index of the highest unused score and mark it as used.

    Used entries carry the sentinel value -1. None is returned when the
    array is empty or every entry has already been consumed.
    """
    if len(scores) == 0:
        return None
    best = int(np.argmax(scores))
    if scores[best] <= -1:
        return None
    scores[best] = -1
    return best


def fixed_width_forward_multitime(train_x, train_y, test_x, test_y, n_selected_features, keep_k=5, data_key="test", fold_key="", method="cnn", cnn_setting_file = "../../parameters/cnn_model_parameter.txt", logger=None, function_key="best_forward_multitime"):
    """
    Forward feature selection over a fixed-width candidate list, scored by
    a CNN or a random forest.

    The first pass evaluates every feature on its own. The best one is
    selected and the next ``keep_k`` best single-feature scores form the
    candidate list. Each later pass evaluates the selected set plus one
    candidate, adds the best candidate to the selection, and refills the
    candidate list with the best not-yet-used feature from the first pass.
    The search stops early when no candidate is left.

    Input
    -----
    train_x: {3d numpy array matrix}, shape (n_samples, n_features, time_length)
        input data
    train_y: {1d numpy array vector}, shape (n_samples,)
        input class labels
    test_x: {3d numpy array matrix}, shape (n_samples, n_features, time_length)
        input data
    test_y: {1d numpy array vector}, shape (n_samples,)
        input class labels
    n_selected_features: int
        number of features to select
    keep_k: int
        width of the candidate list built after the first pass
    data_key, function_key, fold_key: str
        components of the CNN checkpoint prefix
    method: str
        "cnn" or "rf"
    cnn_setting_file: str
        path handed to ``return_cnn_setting_from_file`` (CNN only)
    logger:
        logger to write to; one is created with ``setup_logger("")`` if None

    Output
    ------
    F: {numpy array}, shape (n_selected_features, )
        index of selected features, in the order they were added

    Raises
    ------
    ValueError
        if ``method`` is neither "cnn" nor "rf"
    """
    if method not in ("cnn", "rf"):
        raise ValueError("Unsupported method: " + str(method))
    if logger is None:
        logger = setup_logger("")

    _, n_features, time_length = train_x.shape
    eval_method = "f1"
    if method == "cnn":
        min_class = min(train_y)
        max_class = max(train_y)
        num_classes = max_class - min_class + 1
        data_stru = data_structure(num_classes, min_class, n_features, time_length)
        cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        saver_file_profix = "../../object/" + data_key + "/" +function_key + "/cnn_model_folder/"
        saver_file_profix = init_folder(saver_file_profix)
        saver_file_profix = saver_file_profix + fold_key
        eval_method = cnn_setting.eval_method
    else:
        model = RandomForestClassifier(n_estimators=20, random_state=0)

    # Selected feature set, initially empty.
    F = []
    # Candidates for the current pass: every feature in the first pass.
    F_available = list(range(n_features))
    # Single-feature scores from the first pass; -1 marks "used / unknown".
    F_eval_score = np.zeros(n_features) - 1
    count = 0
    while count < n_selected_features:
        max_eval_value = -1
        idx = None
        f_score = []
        logger.info("For iter " + str(count))
        logger.info("available list for this iter: " + str(F_available))
        for i in F_available:
            if i in F:
                continue
            F.append(i)
            train_x_tmp = train_x[:, F, :]
            test_x_tmp = test_x[:, F, :]
            if method == "cnn":
                # F holds plain Python ints, so the key is e.g. "3, 1".
                F_key = str(F)[1:-1]
                eval_value, train_run_time, test_run_time = model_evaluation_cnn(train_x_tmp, train_y, test_x_tmp, test_y, data_stru, cnn_setting, saver_file_profix + "_F" + F_key, logger)[:3]
            else:
                eval_value, train_run_time, test_run_time = model_evaluation_rf(train_x_tmp, train_y, test_x_tmp, test_y, model, logger)
            if count == 0:
                F_eval_score[i] = eval_value
            logger.info("Features With: " + str(F))
            logger.info("Adding Feature " + str(i) + ": ")
            logger.info(method + " " + eval_method + " Value For Feature " + str(i) + ": " + str(eval_value))
            logger.info(method +" Training time (sec): " + str(train_run_time))
            logger.info(method + " Testing time (sec): " + str(test_run_time))
            f_score.append(eval_value)
            F.pop()
            # record the feature which results in the largest score
            if eval_value > max_eval_value:
                max_eval_value = eval_value
                idx = i

        if idx is None:
            # Every candidate is already selected (or there were none).
            logger.info("No candidate feature left to add; stopping early")
            break

        F_eval_score[idx] = -1
        if count == 0:
            # Replace the full list by the keep_k best remaining features.
            F_available = []
            refill = keep_k
        else:
            # Swap the winner for the next best unused feature.
            F_available.remove(idx)
            refill = 1
        for _ in range(refill):
            add_id = _take_best_unused(F_eval_score)
            if add_id is None:
                break
            F_available.append(add_id)

        logger.info("Eval score vector: " + str(f_score))
        logger.info("The added attribute is: " + str(idx))
        logger.info("larggest eval value is: " + str(max_eval_value))
        # add the feature which results in the largest score
        F.append(idx)
        count += 1
    return np.array(F)
def pv_classification_cnn(parameter_file, file_keyword, function_keyword="pv_classification"):
    """Run the feature-projected CNN/FCN on every matching training file.

    Settings are read with ``read_feature_classification``. For each training
    file whose name contains ``file_keyword`` the feature object whose file
    name contains the training file key is loaded, the model selected by
    ``method`` ('fcn' uses ``run_feature_projected_ijcnn_fcn``, anything else
    ``run_feature_projected_cnn``) is run with the top ``top_k`` features,
    and the results are logged and stored with ``save_obj``.

    Parameters
    ----------
    parameter_file : parameter file handed to ``read_feature_classification``
    file_keyword : substring a training file name must contain to be used
    function_keyword : section keyword handed to the parameter reader

    Raises
    ------
    Exception
        if no file in the object folder contains the training file key.
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder,
     cnn_setting_file) = read_feature_classification(
         parameter_file, function_keyword)

    log_folder = init_folder(log_folder)
    cnn_obj_folder = init_folder(cnn_obj_folder)
    cnn_temp_folder = init_folder(cnn_temp_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)

    # The data files are read with the label in column 0 and a header line,
    # regardless of the class column given in the parameter file.
    class_column = 0
    header = True
    delimiter = ' '
    # Never incremented, so every result name carries "_count0".
    file_count = 0

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'
    # Repeated init_folder calls kept as-is; their return value is not needed.
    init_folder(cnn_obj_folder)
    init_folder(cnn_temp_folder)

    # Drops the last character of obj_folder (presumably a trailing path
    # separator - TODO confirm) before appending the suffix.
    save_obj_folder = init_folder(obj_folder[:-1] + "_" + method + "_out")

    if method == 'fcn':
        run_projected = run_feature_projected_ijcnn_fcn
    else:
        run_projected = run_feature_projected_cnn

    loop_count = -1
    for train_file in file_list:
        if file_keyword not in train_file:
            continue
        loop_count = loop_count + 1
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_'
                    + function_keyword + '_class' + str(class_id) + '_top'
                    + str(top_k) + '_' + method + '.log')
        # Call form works on both Python 2 and Python 3 for a single string.
        print("log file: " + log_file)

        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        logger.info('\nlog file: ' + log_file)
        logger.info(train_file)
        logger.info('cnn setting:\n ' + cnn_setting.to_string())
        logger.info('method: ' + method)
        logger.info('============')

        found_obj_file = ''
        for obj_file in obj_list:
            if file_key in obj_file:
                found_obj_file = obj_file
                break
        if found_obj_file == '':
            raise Exception('No obj file found')
        found_obj_file = obj_folder + found_obj_file

        feature_dict = np.array(load_obj(found_obj_file)[0])
        logger.info("feature array shape: " + str(feature_dict.shape))

        test_file = train_file.replace('train', 'test')
        train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, attr_num = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file, class_column,
            delimiter, header)

        # Every file has its own log, so the shapes are recorded in each one.
        logger.info('train matrix shape: ' + str(train_x_matrix.shape))
        logger.info('train label shape: ' + str(train_y_vector.shape))
        logger.info('test matrix shape: ' + str(test_x_matrix.shape))
        logger.info('test label shape: ' + str(test_y_vector.shape))

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)
        # The projected model only sees the top_k features.
        data_stru.attr_num = top_k

        fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time, fold_test_time, fold_predict_matrix = run_projected(
            train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
            data_stru, cnn_setting, feature_dict, top_k,
            file_key + '_' + method + '_count' + str(file_count), class_id,
            logger)

        result_file = save_obj_folder + file_key + "_" + method + "_project_" + method + "_result.ckpt"
        logger.info("Fold F1: " + str(fold_f1_value))
        logger.info(method + ' fold training time (sec):' + str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))
        logger.info("save obj to " + result_file)
        save_obj([
            fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
            fold_test_time, fold_predict_matrix
        ], result_file)
def pv_cnn_generation_main(parameter_file, file_keyword, function_keyword="pv_cnn_generation"):
    """
    Run the projected-feature CNN/FCN routine with every feature kept
    (feature_dict=None, top_k=-1) on each training file whose name contains
    file_keyword, then hand the per-file results to save_obj.

    Input
    -----
    parameter_file: parameter file handed to read_pv_cnn_generation
    file_keyword: {str} only files of the data folder whose name contains
        this substring are processed
    function_keyword: {str} handed to read_pv_cnn_generation and embedded in
        the log file name

    Output
    ------
    None. For each processed file the list
    [accuracy, f1, predict_y, train_time, test_time, predict_matrix]
    is passed to save_obj with the target
    <result folder> + <file_key> + "_all_feature_" + <method> + "_result.ckpt"
    """
    settings = tuple(read_pv_cnn_generation(parameter_file, function_keyword))
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
     out_model_folder, cnn_setting_file) = settings
    # One pre-joined argument prints the same space separated line as the
    # Python 2 print statement and also parses under Python 3.
    print(" ".join(str(value) for value in settings))

    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)

    # data_stru receives the class column read from the parameter file; the
    # file reader further down is always given column 0.
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    # The listing of obj_folder is not used in this function; the call is kept
    # so the sequence of helper calls stays unchanged.
    list_files(obj_folder)

    # NOTE(review): file_count is never incremented in this function, so the
    # "first file" check below always passes and every model name carries
    # "_count0" -- confirm whether that is intended.
    file_count = 0
    class_column = 0
    header = True
    delimiter = ' '

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    cnn_setting.feature_method = 'save'
    cnn_setting.eval_method = 'f1'
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    result_obj_folder = init_folder(obj_folder + method + "_result_folder")

    matching_files = [name for name in file_list if file_keyword in name]
    for loop_count, train_file in enumerate(matching_files):
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_' +
                    method + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        for message in ('\nlog file: ' + log_file,
                        train_file,
                        'method: ' + method,
                        '============'):
            logger.info(message)

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
         attr_num) = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file,
            class_column, delimiter, header)

        if file_count == 0:
            # Shapes are logged before the transpose below.
            for label, data in (('train matrix shape: ', train_x_matrix),
                                ('train label shape: ', train_y_vector),
                                ('test matrix shape: ', test_x_matrix),
                                ('test label shape: ', test_y_vector)):
                logger.info(label + str(data.shape))

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)

        # The projected-feature routines are reused here; feature_dict=None
        # and top_k=-1 are the arguments for the all-feature case.
        feature_dict = None
        top_k = -1
        model_save_file = file_key + '_count' + str(file_count) + '_' + method

        if method == 'fcn':
            fold_runner = run_feature_projected_ijcnn_fcn
        else:
            fold_runner = run_feature_projected_cnn
        (fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
         fold_test_time, fold_predict_matrix) = fold_runner(
            train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
            data_stru, cnn_setting, feature_dict, top_k, model_save_file,
            class_id, logger)

        logger.info("Fold F1: " + str(fold_f1_value))
        logger.info(method + ' fold training time (sec):' + str(fold_train_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
        logger.info(method + ' fold accuracy: ' + str(fold_accuracy))

        result_file = result_obj_folder + file_key + "_all_feature_" + method + "_result.ckpt"
        logger.info("save obj to " + result_file)
        save_obj([
            fold_accuracy, fold_f1_value, fold_predict_y, fold_train_time,
            fold_test_time, fold_predict_matrix
        ], result_file)
def cnn_classification_main(parameter_file, file_keyword, function_keyword="cnn_classification"):
    """
    Call run_cnn on every training file of the data folder whose name
    contains file_keyword and log the evaluation value and run times.

    Input
    -----
    parameter_file: parameter file handed to read_all_feature_classification
    file_keyword: {str} only files of the data folder whose name contains
        this substring are processed
    function_keyword: {str} handed to read_all_feature_classification and
        embedded in the log file name

    Output
    ------
    None. Results are only written to the per-file logger; the saver file
    name returned by run_cnn is logged as well.
    """
    settings = tuple(read_all_feature_classification(parameter_file, function_keyword))
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, method, log_folder, out_obj_folder,
     out_model_folder, cnn_setting_file) = settings
    # One pre-joined argument prints the same space separated line as the
    # Python 2 print statement and also parses under Python 3.
    print(" ".join(str(value) for value in settings))

    log_folder = init_folder(log_folder)
    out_obj_folder = init_folder(out_obj_folder)
    out_model_folder = init_folder(out_model_folder)

    # data_stru is built from the parameter-file values; num_classes is
    # recomputed from the labels inside the loop but data_stru is not rebuilt.
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    # The obj_folder listing and the cnn keyword are not used below; both
    # calls are kept so the sequence of helper calls stays unchanged.
    list_files(obj_folder)

    # NOTE(review): file_count is never incremented in this function, so the
    # "first file" diagnostics below are logged for every file.
    file_count = 0
    class_column = 0
    header = True
    delimiter = ' '
    group_all = False

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.out_obj_folder = out_obj_folder
    cnn_setting.out_model_folder = out_model_folder
    cnn_setting.feature_method = 'none'
    return_cnn_keyword(cnn_setting)
    init_folder(out_obj_folder)
    init_folder(out_model_folder)
    init_folder(obj_folder + method + "_result_folder")

    matching_files = [name for name in file_list if file_keyword in name]
    for loop_count, train_file in enumerate(matching_files):
        file_key = train_file.replace('.txt', '')
        saver_file_profix = file_key

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
         attr_num) = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file,
            class_column, delimiter, header)

        # The class range is taken from the training labels only.
        min_class = min(train_y_vector)
        max_class = max(train_y_vector)
        num_classes = max_class - min_class + 1

        # Short tag for the log file name describing the evaluation mode.
        if cnn_setting.eval_method == "accuracy":
            cnn_eval_key = "acc"
        else:
            cnn_eval_key = "acc_batch" if num_classes > 2 else "f1"

        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(min_class) + "_" +
                    str(max_class) + "_act" + str(cnn_setting.activation_fun) +
                    "_" + cnn_eval_key + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        for message in ('\nlog file: ' + log_file,
                        train_file,
                        'cnn setting:\n ' + cnn_setting.to_string(),
                        'method: ' + method,
                        '============'):
            logger.info(message)

        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)

        if file_count == 0:
            # Shapes here are the ones after the transpose above.
            for label, data in (('train matrix shape: ', train_x_matrix),
                                ('train label shape: ', train_y_vector),
                                ('test matrix shape: ', test_x_matrix),
                                ('test label shape: ', test_y_vector)):
                logger.info(label + str(data.shape))
            # Small slice of the first sample of each matrix as a sanity check.
            logger.info(train_x_matrix[0, 0:3, 0:2, 0])
            logger.info(test_x_matrix[0, 0:3, 0:2, 0])

        train_y_matrix = y_vector_to_matrix(train_y_vector, num_classes)
        test_y_matrix = y_vector_to_matrix(test_y_vector, num_classes)

        (cnn_eval_value, train_run_time, test_run_time, cnn_predict_proba,
         saver_file, feature_list_obj_file, relu_base_array) = run_cnn(
            train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
            data_stru, cnn_setting, group_all, saver_file_profix, logger)

        logger.info("Fold eval value: " + str(cnn_eval_value))
        logger.info(method + ' fold training time (sec):' + str(train_run_time))
        logger.info(method + ' fold testing time (sec):' + str(test_run_time))
        logger.info("save obj to " + saver_file)
def multi_projected_cnn_classification_main(parameter_file, file_keyword, function_keyword="multi_proj_classification"):
    """
    For every training file whose name contains file_keyword, load the
    matching feature object and call run_load_predict_cnn on the test split
    with the top_k setting read from the parameter file.

    Input
    -----
    parameter_file: parameter file handed to read_feature_classification
    file_keyword: {str} only files of the data folder whose name contains
        this substring are processed
    function_keyword: {str} handed to read_feature_classification and used
        in the log file name and in the output object folder path

    Output
    ------
    None. Accuracy, F1 list and timings are written to the per-file logger.

    Raises
    ------
    Exception: when no entry of the obj_folder listing contains the file key
    """
    (data_keyword, data_folder, attr_num, attr_len, num_classes, start_class,
     class_column, class_id, obj_folder, top_k, method, log_folder,
     cnn_obj_folder, cnn_temp_folder,
     cnn_setting_file) = read_feature_classification(parameter_file, function_keyword)

    # Second-to-last '/' separated component of obj_folder.
    obj_keyword = obj_folder.split('/')[-2]
    model_saved_folder = ("../../object/" + data_keyword +
                          "/projected_classification/" + obj_keyword +
                          "_top" + str(top_k) + "_cnn_model_folder/")
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(obj_keyword)
    print(cnn_obj_folder)
    print(model_saved_folder)

    group_all = False
    log_folder = init_folder(log_folder)
    data_stru = return_data_stru(num_classes, start_class, attr_num, attr_len, class_column)
    file_list = list_files(data_folder)
    obj_list = list_files(obj_folder)

    # NOTE(review): file_count is never incremented in this function, so the
    # "first file" diagnostics below are logged for every file.
    file_count = 0
    class_column = 0
    header = True
    delimiter = ' '

    cnn_setting = return_cnn_setting_from_file(cnn_setting_file)
    cnn_setting.save_obj_folder = cnn_obj_folder
    cnn_setting.temp_obj_folder = cnn_temp_folder
    cnn_setting.eval_method = 'f1'

    save_obj_folder = init_folder(
        "../../object/" + data_keyword + "/" + function_keyword + "/" + obj_keyword + "/")

    matching_files = [name for name in file_list if file_keyword in name]
    for loop_count, train_file in enumerate(matching_files):
        file_key = train_file.replace('.txt', '')
        log_file = (log_folder + data_keyword + '_' + file_key + '_' +
                    function_keyword + '_class' + str(class_id) + '_top' +
                    str(top_k) + '_' + method + '.log')
        print("log file: " + log_file)
        logger = setup_logger(log_file, 'logger_' + str(loop_count))
        for message in ('\nlog file: ' + log_file,
                        train_file,
                        'cnn setting:\n ' + cnn_setting.to_string(),
                        'method: ' + method,
                        '============'):
            logger.info(message)

        # First listed object file whose name contains the file key.
        found_obj_file = next((name for name in obj_list if file_key in name), '')
        if found_obj_file == '':
            raise Exception('No obj file found')
        # NOTE(review): the entry is passed to load_obj exactly as listed,
        # without prefixing obj_folder -- confirm list_files yields loadable
        # paths.
        feature_dict = np.array(load_obj(found_obj_file)[0])
        logger.info("feature array shape: " + str(feature_dict.shape))

        test_file = train_file.replace('train', 'test')
        (train_x_matrix, train_y_vector, test_x_matrix, test_y_vector,
         attr_num) = train_test_file_reading_with_attrnum(
            data_folder + train_file, data_folder + test_file,
            class_column, delimiter, header)
        train_x_matrix = train_test_transpose(train_x_matrix, attr_num, attr_len, False)
        test_x_matrix = train_test_transpose(test_x_matrix, attr_num, attr_len, False)

        if file_count == 0:
            # Shapes here are the ones after the transpose above.
            for label, data in (('train matrix shape: ', train_x_matrix),
                                ('train label shape: ', train_y_vector),
                                ('test matrix shape: ', test_x_matrix),
                                ('test label shape: ', test_y_vector)):
                logger.info(label + str(data.shape))
            logger.info("topk: " + str(top_k))

        data_stru.attr_num = top_k
        (fold_accuracy, fold_f1_list, fold_load_time,
         fold_test_time) = run_load_predict_cnn(
            file_key, model_saved_folder, feature_dict, top_k, test_x_matrix,
            test_y_vector, data_stru, cnn_setting, group_all, save_obj_folder,
            logger)

        logger.info("Fold ACC: " + str(fold_accuracy))
        logger.info("Fold F1 list: " + str(fold_f1_list))
        # The label says "training" although the value logged is the load
        # time returned by run_load_predict_cnn; the text is kept unchanged.
        logger.info(method + ' fold training time (sec):' + str(fold_load_time))
        logger.info(method + ' fold testing time (sec):' + str(fold_test_time))
print out_conv.get_shape() out_conv = tf.reshape(out_conv, [-1, feature_num]) print std_value print feature_num predict_y_prob = conf_out_layer(out_conv, feature_num, num_classes, std_value) #print "predict_y_prob" print predict_y_prob.get_shape() return predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file # End of CNN method if __name__ == '__main__': cnn_setting_file = "../../parameters/cnn_model_parameter.txt" cnn_setting = return_cnn_setting_from_file(cnn_setting_file) train_row = 20 test_row = 10 num_classes = 3 attr_num = 45 attr_len = 125 data_stru = return_data_stru(num_classes, 0, attr_num, attr_len, 0) train_x_matrix = np.random.rand(train_row, attr_len, attr_num, 1) test_x_matrix = np.random.rand(test_row, attr_len, attr_num, 1) train_y_vector = np.array( [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 0, 0, 0, 2]) test_y_vector = np.array([0, 0, 0, 1, 1, 1, 0, 0, 2, 2]) train_y_matrix = y_vector_to_matrix(train_y_vector, num_classes) test_y_matrix = y_vector_to_matrix(test_y_vector, num_classes) print train_x_matrix.shape