def clever_cluster(dcpc, k, logger=None):
    """Cluster the columns of ``dcpc`` and return one representative per cluster.

    K-means with ``k`` clusters is fitted 20 times on the transposed matrix
    (one point per column of ``dcpc``).  The run whose summed Euclidean
    distance between every point and its assigned centre is smallest is kept.

    Args:
        dcpc: 2-D array; its columns are the points that get clustered.
        k: number of clusters.
        logger: optional logger; one is created with ``init_logging('')``
            when omitted.

    Returns:
        For each cluster centre of the best run, the index of the closest
        column of ``dcpc``.
    """
    if logger is None:
        logger = init_logging('')
    dcpc = dcpc.T
    keep_model = None
    keep_dis = -1
    for _ in range(20):
        model = KMeans(n_clusters=k).fit(dcpc)
        centers = np.array(model.cluster_centers_)
        labels = model.labels_
        overall_dis = 0
        for label in range(k):
            # Boolean mask picks all members of the cluster at once.
            members = dcpc[labels == label]
            if len(members) == 0:
                continue
            center = centers[label].reshape(1, -1)
            overall_dis = overall_dis + np.sum(
                euclidean_distances(members, center))
        # keep_dis < 0 marks "nothing kept yet".
        if keep_dis < 0 or keep_dis > overall_dis:
            keep_model = model
            keep_dis = overall_dis
            print(model.labels_)
    closest, _ = pairwise_distances_argmin_min(keep_model.cluster_centers_,
                                               dcpc)
    return closest
def run_ijcnn_fcn(train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix, cnn_setting, saver_file_profix='', logger=None):
    """Build the FCN graph via ``fcn_configure`` and train it with ``cnn_train``.

    Args:
        train_x_matrix: 4-D array; its last three dimensions give the
            placeholder shape ``(attr_len, attr_num, input_map)``.
        train_y_matrix: 2-D label matrix; its second dimension is used as the
            number of classes.
        test_x_matrix, test_y_matrix: test data forwarded to ``cnn_train``.
        cnn_setting: settings object; ``feature_method`` is set to ``'none'``.
        saver_file_profix: prefix prepended to the generated saver file name.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        The six values returned by ``cnn_train``, with a NaN evaluation value
        replaced by 0.
    """
    if logger is None:
        logger = init_logging('')
    _, attr_len, attr_num, input_map = train_x_matrix.shape
    num_classes = train_y_matrix.shape[1]
    cnn_setting.feature_method = 'none'
    # NOTE(review): the returned structure is never used in this function;
    # the call is kept only in case return_data_stru validates its
    # arguments -- confirm and drop.
    return_data_stru(num_classes, 0, attr_num, attr_len, 0)

    input_x_placeholder = tf.placeholder(
        tf.float32, [None, attr_len, attr_num, input_map])
    output_y_placeholder = tf.placeholder(tf.float32, [None, num_classes])
    predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file = fcn_configure(
        input_x_placeholder, num_classes, logger)
    saver_file = saver_file_profix + saver_file

    cnn_eval_value, train_run_time, test_run_time, cnn_predict_prob, saver_file, feature_list_obj_file = cnn_train(
        train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
        num_classes, cnn_setting, input_x_placeholder, output_y_placeholder,
        predict_y_prob, keep_prob_placeholder, keeped_feature_list,
        saver_file, logger)
    # A diverged run reports NaN; callers get 0 instead.
    if str(cnn_eval_value) == 'nan':
        cnn_eval_value = 0
    return cnn_eval_value, train_run_time, test_run_time, cnn_predict_prob, saver_file, feature_list_obj_file
def run_projected_cnn(train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix, data_stru, cnn_setting, group_all=False, saver_file_profix='', logger=None):
    """Build the CNN graph for the given data and train it with ``cnn_train``.

    ``data_stru.attr_num`` and ``data_stru.attr_len`` are overwritten with the
    dimensions taken from ``train_x_matrix`` before the graph is built.

    Args:
        train_x_matrix: 4-D array shaped ``(rows, attr_len, attr_num, input_map)``.
        train_y_matrix, test_x_matrix, test_y_matrix: forwarded to ``cnn_train``.
        data_stru: data description; ``num_classes`` is read, ``attr_num`` and
            ``attr_len`` are updated in place.
        cnn_setting: settings object forwarded to the graph builder and trainer.
        group_all: forwarded to ``cnn_set_flow_graph``; also part of the saver
            file name.
        saver_file_profix: prefix for the saver file name.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        The six values returned by ``cnn_train``, with a NaN evaluation value
        replaced by 0.
    """
    if logger is None:
        logger = init_logging('')
    num_classes = data_stru.num_classes
    logger.info(cnn_setting)

    # The actual data dimensions win over whatever data_stru carried before.
    _, attr_len, attr_num, input_map = train_x_matrix.shape
    data_stru.attr_num = attr_num
    data_stru.attr_len = attr_len

    train_x_placeholder, output_y_placeholder, logits_out, keep_prob_placeholder, keeped_feature_list, saver_file = cnn_set_flow_graph(
        data_stru, cnn_setting, input_map, group_all, logger)
    saver_file = saver_file_profix + "_group_" + str(group_all) + saver_file

    cnn_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, feature_list_obj_file = cnn_train(
        train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix,
        num_classes, cnn_setting, train_x_placeholder, output_y_placeholder,
        logits_out, keep_prob_placeholder, keeped_feature_list, saver_file,
        logger)
    if str(cnn_eval_value) == 'nan':
        cnn_eval_value = 0
    return cnn_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, feature_list_obj_file
def fcn_configure(input_x_placeholder, num_classes, logger):
    """Assemble the FCN graph: three conv + batch-norm layers, a pool, dropout, output.

    The convolution kernels are ``[8, 1]``, ``[5, 1]`` and ``[3, 1]`` with 50,
    40 and 20 output maps.  After the last convolution the tensor is pooled
    over its whole second dimension, passed through dropout, flattened and fed
    to ``conf_out_layer``.

    Args:
        input_x_placeholder: 4-D input tensor.
        num_classes: number of output units.
        logger: logger for shape messages; created with ``init_logging('')``
            when ``None``.

    Returns:
        ``(predict_y_prob, keep_prob_placeholder, keeped_feature_list,
        saver_file)`` where ``keeped_feature_list`` holds the pooled feature
        tensor and ``saver_file`` is a checkpoint name derived from the layer
        configuration.
    """
    if logger is None:
        logger = init_logging('')
    conv_kernel_list = [(8, 1), (5, 1), (3, 1)]
    feature_num_list = [50, 40, 20]
    activation_fun = 0
    strides_list = [1, 1, 1, 1]
    std_value = 0.02
    same_size = False
    # NOTE(review): the first layer assumes a single input map regardless of
    # the placeholder's last dimension -- confirm callers always pass 1.
    num_input_map = 1
    saver_file = ''
    out_conv = input_x_placeholder
    keeped_feature_list = []

    for i, (kernel, num_output_map) in enumerate(
            zip(conv_kernel_list, feature_num_list)):
        conv_row_kernel, conv_col_kernel = kernel
        logger.info('layer: ' + str(i) + " input:")
        logger.info(out_conv.get_shape())
        saver_file = saver_file + "_c" + str(conv_row_kernel) + "_" + str(
            conv_col_kernel)
        out_conv = conf_conv_layer(i, conv_row_kernel, conv_col_kernel,
                                   out_conv, num_input_map, num_output_map,
                                   activation_fun, strides_list, std_value,
                                   same_size)
        logger.info("Conv output: " + str(out_conv.get_shape()))
        out_conv = tf.layers.batch_normalization(out_conv)
        logger.info("Conv after batch normal: " + str(out_conv.get_shape()))
        num_input_map = num_output_map

    # Pool window spans the entire second dimension of the conv output.
    row_samp_rate = out_conv.get_shape()[1]
    col_samp_rate = 1
    out_conv = conf_pool_layer(out_conv, row_samp_rate, col_samp_rate, False)
    keeped_feature_list.append(out_conv)
    logger.info("Feature result shape")
    logger.info(out_conv.get_shape())
    saver_file = saver_file + "global_p" + str(row_samp_rate) + "_" + str(
        col_samp_rate) + '.ckpt'

    # Dropout, then flatten everything except the leading dimension.
    keep_prob_placeholder = tf.placeholder(tf.float32)
    out_conv = tf.nn.dropout(out_conv, keep_prob_placeholder)
    out_fir, out_sec, out_thi, out_for = out_conv.get_shape()
    feature_num = int(out_sec * out_thi * out_for)
    print(out_conv.get_shape())
    out_conv = tf.reshape(out_conv, [-1, feature_num])
    print(std_value)
    print(feature_num)
    predict_y_prob = conf_out_layer(out_conv, feature_num, num_classes,
                                    std_value)
    print(predict_y_prob.get_shape())
    return predict_y_prob, keep_prob_placeholder, keeped_feature_list, saver_file
def rf_feature_extraction(x_matrix, y_vector, predict=False, logger=None, rf_estimator=50):
    """Fit an ``ExtraTreesClassifier`` and return its feature importances.

    Args:
        x_matrix: training features.
        y_vector: training labels.
        predict: when ``True`` the fitted model is scored on the training
            data with ``f1_value_precision_recall_accuracy``.
        logger: optional logger; created with ``init_logging('')`` if omitted.
        rf_estimator: number of trees.

    Returns:
        ``(feature_value_vector, rf_model, f1_value, run_time)``.
        ``feature_value_vector`` is the absolute feature importance,
        ``run_time`` the fit duration in seconds, and ``f1_value`` is ``-1``
        when ``predict`` is not ``True``.
    """
    if logger is None:
        logger = init_logging('')
    rf_model = ExtraTreesClassifier(n_estimators=rf_estimator, random_state=0)
    start_time = time.time()
    rf_model.fit(x_matrix, y_vector)
    run_time = time.time() - start_time
    feature_value_vector = np.absolute(rf_model.feature_importances_)

    # Default must exist on every path: the previous code only bound
    # f1_value inside the predict branch and crashed on return otherwise.
    f1_value = -1
    if predict is True:
        predict_y = rf_model.predict(x_matrix)
        # The helper returns (accuracy, precision, recall, f1, tp, fp, tn, fn).
        f1_value = f1_value_precision_recall_accuracy(predict_y, y_vector,
                                                      1)[3]
    return feature_value_vector, rf_model, f1_value, run_time
def run_cnn_lda_class_based_feature_analysis_main(feature_folder, feature_file_keyword, class_label, data_folder, data_stru, logger=None):
    """Aggregate per-fold LDA attribute weights for one class.

    Every file in ``feature_folder`` whose name contains both
    ``feature_file_keyword`` and ``'class_<class_label>_'`` is processed: the
    matching training file is read for its labels, the stored CNN feature
    object is loaded, and ``project_cnn_feature_lda_analysis`` is run on a
    one-vs-rest label vector.  Fold results are summed and ranked.

    Args:
        feature_folder: folder holding the saved feature objects.
        feature_file_keyword: substring a feature file name must contain.
        class_label: class treated as positive.
        data_folder: folder holding the training text files.
        data_stru: unused here.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(class_based_index_array, class_based_value_array, overall_time)``:
        indices sorted by descending summed weight, the summed weights, and
        the accumulated analysis time.
    """
    if logger is None:
        logger = init_logging('')
    class_keyword = 'class_' + str(class_label) + '_'
    file_delimiter = '_'
    lda_weight_feature_matrix = []
    overall_time = 0

    for feature_file in listFiles(feature_folder):
        if feature_file_keyword not in feature_file or class_keyword not in feature_file:
            continue
        logger.info(feature_file)
        # The training file name is rebuilt from the 2nd and 3rd name parts.
        name_parts = feature_file.split(file_delimiter)
        train_file = name_parts[1] + file_delimiter + name_parts[2] + '.txt'
        logger.info(train_file)
        _, train_y_vector = readFile(data_folder + train_file)

        # One-vs-rest labels: 1 for class_label, 0 otherwise.
        temp_train_y_vector = np.where(train_y_vector == class_label, 1, 0)
        fold_positive_len = len(np.where(temp_train_y_vector == 1)[0])
        fold_negative_len = len(temp_train_y_vector) - fold_positive_len
        logger.info("=====")
        logger.info("positive class labels length: " + str(fold_positive_len))
        logger.info("negative class labels length: " + str(fold_negative_len))

        fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn = load_obj(
            feature_folder + feature_file)
        logger.info(fold_train_sensor_result.shape)
        logger.info(fold_weight_fullconn.shape)
        logger.info(fold_bias_fullconn.shape)
        fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
        logger.info(fold_train_sensor_result.shape)

        # NOTE(review): run_cnn_lda_feature_analysis_main unpacks FOUR values
        # from this same helper; one of the two call sites is presumably
        # stale -- verify against project_cnn_feature_lda_analysis.
        fold_attr_imp_index, skip_count, fold_time = project_cnn_feature_lda_analysis(
            fold_train_sensor_result, temp_train_y_vector, logger)
        overall_time = overall_time + fold_time
        logger.info(fold_attr_imp_index.shape)
        logger.info("skip: " + str(skip_count))
        lda_weight_feature_matrix.append(fold_attr_imp_index)

    lda_weight_feature_matrix = np.squeeze(np.array(lda_weight_feature_matrix))
    logger.info(lda_weight_feature_matrix.shape)

    start_time = time.time()
    class_based_value_array = np.sum(lda_weight_feature_matrix, axis=0)
    class_based_index_array = np.argsort(-class_based_value_array)
    overall_time = overall_time + time.time() - start_time
    return class_based_index_array, class_based_value_array, overall_time
def run_cnn_pca_class_based_feature_analysis_main(feature_folder, feature_file_keyword, class_label, data_folder, data_stru, logger=None):
    """Aggregate per-fold PCA attribute rankings for one class.

    Every file in ``feature_folder`` whose name contains both
    ``feature_file_keyword`` and ``'class_<class_label>_'`` is processed: the
    stored CNN features are restricted to the instances of ``class_label`` and
    ranked with ``run_pca_proj_feature_3D``.  The per-fold rankings are then
    combined with ``majority_vote_index``.

    Args:
        feature_folder: folder holding the saved feature objects.
        feature_file_keyword: substring a feature file name must contain.
        class_label: class whose instances are analysed.
        data_folder: folder holding the training text files.
        data_stru: unused here.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(feature_vector, feature_value_vector, overall_time)`` -- the two
        values returned by ``majority_vote_index`` plus the accumulated
        analysis time in seconds.
    """
    if logger is None:
        logger = init_logging('')
    class_keyword = 'class_' + str(class_label) + '_'
    file_delimiter = '_'
    weight_feature_matrix = []
    overall_time = 0

    for feature_file in listFiles(feature_folder):
        if feature_file_keyword not in feature_file or class_keyword not in feature_file:
            continue
        logger.info(feature_file)
        # The training file name is rebuilt from the 2nd and 3rd name parts.
        name_parts = feature_file.split(file_delimiter)
        train_file = name_parts[1] + file_delimiter + name_parts[2] + '.txt'
        logger.info(train_file)
        _, train_y_vector = readFile(data_folder + train_file)
        class_label_index = np.where(train_y_vector == class_label)[0]
        logger.info("=====")
        logger.info("positive class labels length: " + str(len(class_label_index)))

        fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn = load_obj(
            feature_folder + feature_file)
        logger.info(fold_train_sensor_result.shape)
        logger.info(fold_weight_fullconn.shape)
        logger.info(fold_bias_fullconn.shape)
        # Keep only the instances that belong to class_label.
        fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
        fold_train_sensor_result = fold_train_sensor_result[class_label_index, :, :]
        logger.info(fold_train_sensor_result.shape)

        start_time = time.time()
        fold_attr_imp_index, fold_attr_imp = run_pca_proj_feature_3D(
            fold_train_sensor_result)
        overall_time = overall_time + time.time() - start_time
        logger.info(fold_attr_imp_index.shape)
        logger.info(fold_attr_imp.shape)
        weight_feature_matrix.append(fold_attr_imp_index)

    weight_feature_matrix = np.array(weight_feature_matrix).astype(int)
    logger.info(weight_feature_matrix.shape)

    start_time = time.time()
    feature_vector, feature_value_vector = majority_vote_index(
        weight_feature_matrix, -1)
    logger.info(feature_vector.shape)
    overall_time = overall_time + time.time() - start_time
    return feature_vector, feature_value_vector, overall_time
def run_cnn_combined_rf_lda_class_based_feature_analysis_main(feature_folder, feature_file_pre, feature_file_post, start_class, end_class, data_folder, data_stru, logger=None):
    """Combine RF + LDA attribute weights over all training folds and classes.

    For each file in ``data_folder`` whose name contains ``'train_'`` and for
    each class in ``range(start_class, end_class)``, the stored CNN features
    are loaded and scored one-vs-rest with
    ``project_cnn_feature_combined_rf_lda_analysis``.  The weights are summed
    over the second axis of the per-fold stack and then over folds, and each
    class row is ranked in descending order.

    Args:
        feature_folder: folder holding the saved feature objects.
        feature_file_pre: text placed before the training keyword in a
            feature file name.
        feature_file_post: text placed after the class label in a feature
            file name.
        start_class: first class label (inclusive).
        end_class: last class label (exclusive).
        data_folder: folder holding the training text files.
        data_stru: unused here.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(feature_index_feature_matrix, feature_weight_feature_matrix,
        overall_time)``.
    """
    if logger is None:
        logger = init_logging('')
    feature_weight_feature_matrix = []
    overall_time = 0

    for train_file in listFiles(data_folder):
        if 'train_' not in train_file:
            continue
        logger.info(train_file)
        train_keyword = train_file.replace('.txt', '')
        _, train_y_vector = readFile(data_folder + train_file)

        fold_feature_weight_matrix = []
        for class_label in range(start_class, end_class):
            logger.info("class label: " + str(class_label))
            feature_file = feature_file_pre + train_keyword + "_class_" + str(
                class_label) + feature_file_post
            fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn = load_obj(
                feature_folder + feature_file)
            fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
            temp_train_y_vector = np.where(train_y_vector == class_label, 1, 0)
            fold_attr_imp_index, fold_time = project_cnn_feature_combined_rf_lda_analysis(
                fold_train_sensor_result, temp_train_y_vector, logger)
            overall_time = overall_time + fold_time
            # Log the shapes once per fold, for the first class processed
            # (previously tied to label 0, which never fired when
            # start_class was not 0).
            if class_label == start_class:
                logger.info(fold_train_sensor_result.shape)
                logger.info(fold_weight_fullconn.shape)
                logger.info(fold_bias_fullconn.shape)
                logger.info(fold_attr_imp_index.shape)
            fold_feature_weight_matrix.append(fold_attr_imp_index)

        fold_feature_weight_matrix = np.array(fold_feature_weight_matrix)
        logger.info("fold_feature_weight_matrix.shape")
        logger.info(fold_feature_weight_matrix.shape)
        fold_feature_weight_matrix = np.sum(fold_feature_weight_matrix, axis=1)
        logger.info("fold_feature_weight_matrix final shape")
        logger.info(fold_feature_weight_matrix.shape)
        feature_weight_feature_matrix.append(fold_feature_weight_matrix)

    feature_weight_feature_matrix = np.array(feature_weight_feature_matrix)
    logger.info("feature_weight_feature_matrix.shape")
    logger.info(feature_weight_feature_matrix.shape)

    start_time = time.time()
    # Sum over folds, then rank within each class row (largest first).
    feature_weight_feature_matrix = np.sum(feature_weight_feature_matrix, axis=0)
    feature_index_feature_matrix = np.argsort(-feature_weight_feature_matrix, axis=1)
    overall_time = overall_time + time.time() - start_time

    logger.info(feature_index_feature_matrix.shape)
    logger.info(feature_index_feature_matrix[0:5, 0:6])
    logger.info(feature_weight_feature_matrix.shape)
    logger.info(feature_weight_feature_matrix[0:5, 0:6])
    logger.info("fold cnn combined rf and lda projected feature generation overall time (sec)")
    logger.info(overall_time)
    return feature_index_feature_matrix, feature_weight_feature_matrix, overall_time
def multiple_f1_value_precision_recall_accuracy(predict_y_vector, real_y_vector, logger=None):
    """Compute overall accuracy and a one-vs-rest F1 score per class.

    Classes are every integer from the smallest to the largest label found in
    ``real_y_vector`` (inclusive).  A class without any true positive gets an
    F1 of 0.

    Args:
        predict_y_vector: predicted labels (array or sequence).
        real_y_vector: true labels, same length as ``predict_y_vector``.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(accuracy, f1_value_list)`` where ``accuracy`` is a float and
        ``f1_value_list`` is a numpy array with one entry per class.

    Raises:
        ValueError: if the vectors differ in length or are empty.
    """
    if logger is None:
        logger = init_logging('')
    if len(predict_y_vector) != len(real_y_vector):
        raise ValueError("Length for prediction is not same")
    # asarray lets plain lists work; "list == int" would not compare
    # element-wise.
    predict_y = np.asarray(predict_y_vector)
    real_y = np.asarray(real_y_vector)
    instance_num = len(real_y)
    if instance_num == 0:
        raise ValueError("Cannot score empty label vectors")

    min_class = int(real_y.min())
    max_class = int(real_y.max())
    f1_value_list = []
    for label in range(min_class, max_class + 1):
        predicted = predict_y == label
        actual = real_y == label
        tp = int(np.count_nonzero(predicted & actual))
        fp = int(np.count_nonzero(predicted & ~actual))
        fn = int(np.count_nonzero(~predicted & actual))
        if tp == 0:
            # Precision and recall are both 0 (or undefined) here.
            f1_value = 0
        else:
            precision = float(tp) / float(tp + fp)
            recall = float(tp) / float(tp + fn)
            f1_value = float(2 * precision * recall) / float(precision + recall)
        f1_value_list.append(f1_value)
    f1_value_list = np.array(f1_value_list)

    # Labels are compared as integers, matching the per-instance int() casts
    # this function has always applied.
    correct = np.count_nonzero(predict_y.astype(int) == real_y.astype(int))
    accuracy = float(correct) / float(instance_num)
    return accuracy, f1_value_list
def clever_rank(dcpc, logger=None):
    """Score each column of ``dcpc`` by the sum of its squared entries.

    Args:
        dcpc: 2-D array; one score is produced per column.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        Dict mapping column index to that column's squared L2 norm.
    """
    if logger is None:
        logger = init_logging('')
    _, column_count = dcpc.shape
    return {
        col: np.sum(np.square(dcpc[:, col]))
        for col in range(column_count)
    }
def project_cnn_feature_combined_rf_lda_analysis(feature_matrix, y_vector, logger=None):
    """Score attributes per feature map by adding RF and LDA weights.

    For every map (third axis of ``feature_matrix``) the 2-D slice is passed
    to ``rf_feature_extraction`` and ``gene_lda_feature_v2``; the two weight
    vectors are added element-wise.

    Args:
        feature_matrix: 3-D array; the last axis indexes feature maps.
        y_vector: labels passed to both extractors.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(map_attr_imp_matrix, run_time)`` -- one combined weight vector per
        map stacked into an array, and the seconds spent in both extractors.
    """
    if logger is None:
        logger = init_logging('')
    _, _, num_map = feature_matrix.shape
    predict = True
    rf_time = 0
    lda_time = 0
    map_attr_imp_matrix = []

    for i in range(num_map):
        map_feature_matrix = feature_matrix[:, :, i]

        # Timers start from the current clock; they used to start from 0,
        # which made the reported durations equal to the Unix timestamp.
        start_time = time.time()
        # NOTE(review): this expects (rank_vector, weight_vector, model,
        # score); the rf_feature_extraction defined in this file returns
        # (weight_vector, model, f1, run_time) -- confirm which definition
        # is actually in scope for this call.
        _, feature_value_vector, _, _ = rf_feature_extraction(
            map_feature_matrix, y_vector, predict, logger)
        rf_time = rf_time + time.time() - start_time
        if i == 0:
            logger.info(feature_value_vector.shape)

        start_time = time.time()
        _, lda_feature_value_vector, _, _ = gene_lda_feature_v2(
            map_feature_matrix, y_vector, predict, logger)
        lda_time = lda_time + time.time() - start_time

        map_attr_imp_matrix.append(feature_value_vector + lda_feature_value_vector)

    map_attr_imp_matrix = np.array(map_attr_imp_matrix)
    logger.info(map_attr_imp_matrix.shape)
    return map_attr_imp_matrix, rf_time + lda_time
def run_cnn_lda_feature_analysis_main(feature_folder, feature_file_keyword, data_folder, data_stru, feature_postfix, logger=None):
    """Run the LDA attribute analysis on every (data file, feature file) pair.

    A feature file is paired with a data file when its name contains both
    ``feature_file_keyword`` and the data file name.  Each pair is analysed
    with ``project_cnn_feature_lda_analysis`` and the resulting index
    matrices are merged by ``fold_feature_combination_F_C_A``.

    Args:
        feature_folder: folder holding the saved feature objects.
        feature_file_keyword: substring a feature file name must contain.
        data_folder: folder holding the data files.
        data_stru: unused here.
        feature_postfix: unused here.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(lda_feature_matrix, overall_time)``.
    """
    if logger is None:
        logger = init_logging('')
    file_list = listFiles(data_folder)
    feature_file_list = listFiles(feature_folder)
    lda_weight_feature_matrix = []
    overall_time = 0

    for train_file in file_list:
        logger.info(train_file)
        for feature_file in feature_file_list:
            if feature_file_keyword not in feature_file or train_file not in feature_file:
                continue
            logger.info(feature_file)
            _, train_y_vector = readFile(data_folder + train_file)
            fold_train_sensor_result, fold_weight_fullconn, fold_bias_fullconn = load_obj(
                feature_folder + feature_file)
            logger.info(fold_train_sensor_result.shape)
            logger.info(fold_weight_fullconn.shape)
            logger.info(fold_bias_fullconn.shape)
            fold_train_sensor_result = np.squeeze(fold_train_sensor_result)
            logger.info(fold_train_sensor_result.shape)

            # NOTE(review): run_cnn_lda_class_based_feature_analysis_main
            # unpacks THREE values from this same helper -- verify which
            # arity project_cnn_feature_lda_analysis really returns.
            fold_attr_imp_index, fold_attr_imp, skip_count, fold_time = project_cnn_feature_lda_analysis(
                fold_train_sensor_result, train_y_vector, logger)
            overall_time = overall_time + fold_time
            logger.info(fold_attr_imp_index.shape)
            logger.info(fold_attr_imp.shape)
            logger.info("skip: " + str(skip_count))
            lda_weight_feature_matrix.append(fold_attr_imp_index)

    lda_weight_feature_matrix = np.array(lda_weight_feature_matrix).astype(int)
    logger.info(lda_weight_feature_matrix.shape)

    start_time = time.time()
    lda_feature_matrix = fold_feature_combination_F_C_A(lda_weight_feature_matrix)
    overall_time = overall_time + time.time() - start_time
    logger.info(lda_feature_matrix.shape)
    logger.info(lda_feature_matrix[0:5, 0:6])
    return lda_feature_matrix, overall_time
def computeDCPC(mts_data, threshold=0.9, logger=None):
    """Compute descriptive common principal components for a set of items.

    For each item ``mts_data[r]`` the correlation matrix (``np.corrcoef``,
    NaNs replaced by 0) is decomposed with SVD, and the number of leading
    components needed to reach ``threshold`` of the total singular-value mass
    is recorded.  The element-wise products ``u.T * u`` of all items are
    summed, the sum is decomposed again, and the first ``p`` rows of its left
    singular matrix are returned, where ``p`` is the largest component count
    seen across items.

    Args:
        mts_data: 3-D array; the first axis indexes items.
        threshold: cumulative fraction of singular-value mass to reach.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        2-D array holding the first ``p`` rows of the final decomposition.

    Raises:
        ValueError: if ``mts_data`` contains no items.
    """
    if logger is None:
        logger = init_logging('')
    row_num, attr_len, attr_num = mts_data.shape
    print(mts_data.shape)
    if row_num == 0:
        raise ValueError("mts_data must contain at least one item")

    loadings = []
    component_counts = []
    for r in range(row_num):
        corr_matrix = np.corrcoef(mts_data[r, :, :])
        # Constant rows produce NaN correlations; treat them as uncorrelated.
        corr_matrix[np.isnan(corr_matrix)] = 0
        u, s, _ = np.linalg.svd(corr_matrix, full_matrices=True)
        percent_var = s / sum(s)
        reached = np.nonzero(np.cumsum(percent_var) >= threshold)[0]
        # Number of components, not the 0-based index of the last one: the
        # index was previously used as the count, dropping one component
        # (and returning an empty matrix when a single component sufficed).
        if reached.size:
            component_counts.append(int(reached[0]) + 1)
        else:
            component_counts.append(len(percent_var))
        loadings.append(u)
    p = max(component_counts)

    # NOTE(review): this is an element-wise product, not a matrix product --
    # confirm that is the intended accumulation.
    h_matrix = np.zeros(loadings[0].shape)
    for load_m in loadings:
        h_matrix = h_matrix + np.multiply(load_m.T, load_m)
    logger.info(h_matrix.shape)
    h_matrix[np.isnan(h_matrix)] = 0

    dcpc, _, _ = np.linalg.svd(h_matrix, full_matrices=True)
    logger.info(dcpc[0:p, :])
    return dcpc[0:p, :]
def run_dcpc_main(data_folder, class_column, num_classes, obj_folder, threshold, logger=None):
    """Compute and save per-class DCPC matrices for every training file.

    Each file in ``data_folder`` whose name contains ``'train_'`` is read,
    split by class label, and ``computeDCPC`` is run on the instances of each
    class.  The ``{label: dcpc}`` dict of a file is saved (wrapped in a list)
    to ``obj_folder`` under the file name with ``.txt`` replaced by
    ``_dcpc.obj``.

    Args:
        data_folder: folder holding the training text files.
        class_column: unused here.
        num_classes: unused here.
        obj_folder: destination folder for the saved objects.
        threshold: forwarded to ``computeDCPC``.
        logger: optional logger; created with ``init_logging('')`` if omitted.
    """
    if logger is None:
        logger = init_logging('')
    for train_file in list_files(data_folder):
        if "train_" not in train_file:
            continue
        logger.info(train_file)
        out_obj_file = train_file.replace('.txt', '_dcpc.obj')
        x_matrix, y_vector = file_read_split(data_folder + train_file)
        min_class = min(y_vector)
        max_class = max(y_vector) + 1
        logger.info("x matrix tran after shape: " + str(x_matrix.shape))

        # Fresh dict per file so classes missing from this file do not
        # inherit a matrix computed for a previous file.
        out_obj_dict = {}
        for label in range(min_class, max_class):
            label_index = np.where(y_vector == label)[0]
            if len(label_index) == 0:
                # Nothing to decompose for a label with no instances.
                logger.info("class: " + str(label) + " has no instances, skipped")
                continue
            label_x_matrix = x_matrix[label_index, :, :]
            logger.info("class: " + str(label))
            print("class: " + str(label))
            logger.info("x matrix tran before shape: " + str(label_x_matrix.shape))
            label_dcpc = computeDCPC(label_x_matrix, threshold)
            logger.info("class: " + str(label) + " dcpc shape: " + str(label_dcpc.shape))
            out_obj_dict[label] = label_dcpc

        logger.info("dcpc out obj: " + str(obj_folder + out_obj_file))
        save_obj([out_obj_dict], obj_folder + out_obj_file)
def cnn_set_flow_graph(data_stru, cnn_setting, input_map, group_all=False, logger=None):
    """Reset the default TF graph and build the CNN placeholders and network.

    The default graph is cleared, the graph-level random seed is set to 0,
    and the label and input placeholders are created before the network
    itself is assembled by ``cnn_configure``.

    Args:
        data_stru: provides ``attr_num``, ``attr_len`` and ``num_classes``.
        cnn_setting: settings object forwarded to ``cnn_configure``.
        input_map: size of the last dimension of the input placeholder.
        group_all: forwarded to ``cnn_configure``.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(train_x_placeholder, output_y_placeholder, logits_out,
        keep_prob_placeholder, keeped_feature_list, saver_file)``.
    """
    if logger is None:
        logger = init_logging('')
    tf.reset_default_graph()
    tf.random.set_random_seed(0)

    class_count = data_stru.num_classes
    input_shape = [None, data_stru.attr_len, data_stru.attr_num, input_map]
    # Label placeholder is created first, then the input placeholder.
    output_y_placeholder = tf.placeholder(tf.float32, [None, class_count])
    train_x_placeholder = tf.placeholder(tf.float32, input_shape)

    network = cnn_configure(train_x_placeholder, cnn_setting, class_count,
                            group_all, logger)
    logits_out, keep_prob_placeholder, keeped_feature_list, saver_file = network
    return (train_x_placeholder, output_y_placeholder, logits_out,
            keep_prob_placeholder, keeped_feature_list, saver_file)
def cnn_feature_lda(train_x_matrix, train_y_vector, predict=False, logger=None):
    """Fit an LDA model on max-abs scaled features and rank the attributes.

    Every column is divided by its largest absolute value; columns whose
    largest absolute value is 0 are left as they are.  NaNs are then set to 0
    and infinities to 1 before ``bi_gene_lda_model`` is fitted.

    Args:
        train_x_matrix: 2-D feature matrix.
        train_y_vector: labels.
        predict: when truthy the weight vector is L2-normalised and the model
            is scored on the training data.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(feature_vector_norm, feature_value_vector, lda_model,
        averaged_acc)``.  ``feature_vector_norm[j]`` is the rank of attribute
        ``j`` (0 = smallest weight); ``averaged_acc`` is ``-1`` when
        ``predict`` is falsy.
    """
    if logger is None:
        logger = init_logging('')
    train_norm_vector = np.linalg.norm(train_x_matrix, axis=0, ord=np.inf)[None, :]
    # Divide zero-norm columns by 1 instead of masking them with `where=`:
    # a masked ufunc call without `out=` leaves those entries uninitialised.
    safe_norm_vector = np.where(train_norm_vector == 0, 1, train_norm_vector)
    train_x_matrix = np.true_divide(train_x_matrix, safe_norm_vector)
    train_x_matrix[np.isnan(train_x_matrix)] = 0
    train_x_matrix[np.isinf(train_x_matrix)] = 1

    lda_model, train_time = bi_gene_lda_model(train_x_matrix, train_y_vector)
    feature_value_vector = np.absolute(lda_model.scalings_.T[0])

    averaged_acc = -1
    if predict:
        feature_value_vector = preprocessing.normalize(
            feature_value_vector.reshape(1, len(feature_value_vector)),
            norm='l2')[0]
        predict_y = lda_model.predict(train_x_matrix)
        averaged_acc = averaged_class_based_accuracy(predict_y, train_y_vector)

    # Convert weights to ranks: the attribute at sorted position i gets i.
    len_weight = len(feature_value_vector)
    feature_vector_norm = np.zeros(len_weight)
    feature_vector_norm[np.argsort(feature_value_vector)] = np.arange(len_weight)
    return feature_vector_norm, feature_value_vector, lda_model, averaged_acc
def lda_feature_extraction(x_matrix, y_vector, predict=False, logger=None):
    """Fit LDA on max-abs scaled features and return absolute scalings.

    Every column is divided by its largest absolute value (columns whose
    largest absolute value is 0 are left as they are); NaNs become 0 and
    infinities 1.  If the scaled matrix is constant or all zero, no model is
    fitted.

    Args:
        x_matrix: 2-D feature matrix.
        y_vector: labels.
        predict: when ``True`` the model is scored on the training data with
            ``f1_value_precision_recall_accuracy``.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(feature_value_vector, lda_model, f1_value, run_time)``.  For
        degenerate input this is ``(None, None, -1, 0)``; ``f1_value`` is
        ``-1`` when ``predict`` is not ``True``.
    """
    if logger is None:
        logger = init_logging('')
    train_norm_vector = np.linalg.norm(x_matrix, axis=0, ord=np.inf)[None, :]
    # Divide zero-norm columns by 1 instead of masking them with `where=`:
    # a masked ufunc call without `out=` leaves those entries uninitialised.
    safe_norm_vector = np.where(train_norm_vector == 0, 1, train_norm_vector)
    x_matrix = np.true_divide(x_matrix, safe_norm_vector)
    x_matrix[np.isnan(x_matrix)] = 0
    x_matrix[np.isinf(x_matrix)] = 1

    # Nothing to discriminate on: constant or all-zero features.
    # (`np.any(...) is False` never matched because np.any returns np.bool_.)
    if x_matrix.max() == x_matrix.min() or not np.any(x_matrix):
        return None, None, -1, 0

    # Priors are the empirical class frequencies, one per class actually
    # present, in sorted label order.
    _, class_counts = np.unique(y_vector, return_counts=True)
    all_count = len(y_vector)
    prior_vector = [float(c_count) / all_count for c_count in class_counts]

    lda_model = LinearDiscriminantAnalysis(priors=prior_vector)
    start_time = time.time()
    lda_model.fit(x_matrix, y_vector)
    run_time = time.time() - start_time
    feature_value_vector = np.absolute(lda_model.scalings_.T[0])

    # Default must exist on every path: f1_value used to be bound only in
    # the predict branch, so the return crashed otherwise.
    f1_value = -1
    if predict is True:
        predict_y = lda_model.predict(x_matrix)
        # The helper returns (accuracy, precision, recall, f1, tp, fp, tn, fn).
        f1_value = f1_value_precision_recall_accuracy(predict_y, y_vector,
                                                      1)[3]
    return feature_value_vector, lda_model, f1_value, run_time
def conf_conv_layer(layer, kernel_r, kernel_c, input_matrix, num_input_map, num_output_map, activation_fun=0, strides_list=[1, 1, 1, 1], std_value=0.1, same_size=False, logger=None):
    """Create one 2-D convolution layer followed by ReLU.

    Weights are drawn from a truncated normal with ``stddev=std_value`` and
    the bias is a constant ``std_value``.  The graph-level random seed is set
    to ``layer`` before the variables are created.

    Args:
        layer: layer index; used as random seed and in the variable names.
        kernel_r: kernel height.
        kernel_c: kernel width.
        input_matrix: input tensor for ``tf.nn.conv2d``.
        num_input_map: number of input channels.
        num_output_map: number of output channels.
        activation_fun: currently ignored; ReLU is always applied.
        strides_list: currently ignored; strides are fixed to ``[1, 1, 1, 1]``.
            The list default is never mutated.
        std_value: weight stddev and bias constant.
        same_size: ``True`` (or the string ``"True"``) selects ``'SAME'``
            padding, anything else ``'VALID'``.
        logger: optional logger; created with ``init_logging("")`` if omitted.

    Returns:
        The ReLU-activated convolution output tensor.
    """
    if logger is None:
        logger = init_logging("")
    tf.random.set_random_seed(layer)
    weight_variable = tf.Variable(
        tf.truncated_normal(
            [kernel_r, kernel_c, num_input_map, num_output_map],
            stddev=std_value),
        name='conv_w_' + str(layer))
    bias_variable = tf.Variable(
        tf.constant(std_value, shape=[num_output_map]),
        name='conv_b_' + str(layer))

    # Accept the boolean as well as the legacy string form; only the string
    # "True" used to be recognised, so passing True silently gave VALID.
    if same_size is True or same_size == "True":
        str_padding = 'SAME'
    else:
        str_padding = 'VALID'

    # NOTE(review): strides_list and activation_fun are accepted but not
    # used -- confirm whether they should be honoured.
    ret_conv_before_act = tf.nn.conv2d(
        input_matrix, weight_variable, strides=[1, 1, 1, 1],
        padding=str_padding) + bias_variable
    return tf.nn.relu(ret_conv_before_act)
def run_load_predict_cnn(fold_keyword, model_saved_folder, feature_array, top_k, test_x_matrix, test_y_vector, data_stru, cnn_setting, group_all=True, save_obj_folder="./", logger=None):
    """Load one saved binary CNN per class and combine them into a prediction.

    For each class ``c`` the first file in ``model_saved_folder`` whose name
    contains ``".index"``, ``fold_keyword`` and ``"class<c>_"`` is loaded.
    The test data is restricted to the class's ``top_k`` features, and the
    probability in column 1 of the model output is collected.  The predicted
    label of an instance is the class with the largest such probability.

    ``data_stru.num_classes`` is temporarily set to 2 while the models are
    loaded and is always restored afterwards.

    Args:
        fold_keyword: substring identifying the fold in model file names.
        model_saved_folder: folder containing the saved models.
        feature_array: per-class feature index lists.
        top_k: number of leading features to keep per class.
        test_x_matrix: 4-D test data; features are on the third axis.
        test_y_vector: true test labels.
        data_stru: data description; ``num_classes`` is read.
        cnn_setting: settings object forwarded to ``load_model``.
        group_all: forwarded to ``load_model``.
        save_obj_folder: folder for the saved probability matrix.
        logger: optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        ``(acc, f1_list, load_time, test_time)``.

    Raises:
        Exception: if no model file is found for a class, or if the two
            accuracy computations disagree.
    """
    if logger is None:
        logger = init_logging('')
    real_num_classes = data_stru.num_classes
    model_list = list_files(model_saved_folder)
    load_time = 0
    test_time = 0
    multi_predict = []

    # The per-class models are binary; restore the real count even on error.
    data_stru.num_classes = 2
    try:
        for c in range(real_num_classes):
            logger.info("Class: " + str(c))
            class_keyword = "class" + str(c) + "_"
            found_model_file = ""
            for model_file in model_list:
                if (".index" in model_file and fold_keyword in model_file
                        and class_keyword in model_file):
                    found_model_file = model_file.replace(".index", "")
                    print(found_model_file)
                    break
            if found_model_file == "":
                raise Exception("Model for " + class_keyword + " and " +
                                fold_keyword + " Not Found!!!")
            found_model_file = model_saved_folder + found_model_file

            class_feature = feature_array[c][0:top_k]
            # found_model_file already includes the folder.
            logger.info("model file: " + str(found_model_file))
            logger.info("feature list: " + str(class_feature))
            temp_test_x_matrix = test_x_matrix[:, :, class_feature, :]
            logger.info("In run_load_predict_cnn: " + str(temp_test_x_matrix.shape))

            start_time = time.time()
            cnn_session, predict_y_proba, train_x_placeholder, keep_prob_placeholder = load_model(
                found_model_file, data_stru, cnn_setting, group_all, logger)
            load_time = load_time + time.time() - start_time
            try:
                start_time = time.time()
                cnn_predict_proba = load_model_predict(
                    cnn_session, temp_test_x_matrix, predict_y_proba,
                    train_x_placeholder, keep_prob_placeholder)
                test_time = test_time + time.time() - start_time
            finally:
                cnn_session.close()
            multi_predict.append(cnn_predict_proba[:, 1])
    finally:
        data_stru.num_classes = real_num_classes

    multi_predict = np.array(multi_predict)
    multi_predict_vector = np.argmax(multi_predict, axis=0)
    save_obj_file = save_obj_folder + fold_keyword + "_" + str(top_k) + ".out"
    save_obj([multi_predict], save_obj_file)
    logger.info("output obj saved to: " + save_obj_file)
    logger.info("multi predict matrix shape: " + str(multi_predict.shape))
    logger.info("multi predict vector shape: " + str(multi_predict_vector.shape))
    logger.info("test y vector: " + str(test_y_vector.shape))

    acc = accuracy_score(test_y_vector, multi_predict_vector)
    acc1, f1_list = multiple_f1_value_precision_recall_accuracy(
        multi_predict_vector, test_y_vector, logger)
    # Sanity check between the two implementations, tolerant to rounding.
    if not np.isclose(acc, acc1):
        raise Exception("check accuracy")
    return acc, f1_list, load_time, test_time
def run_feature_svm_load_proba(model_pre, test_x_matrix, test_y_vector, feature_array, attr_num, logger=None):
    """Load one saved SVM per class, predict with probabilities, and combine.

    For each class ``i`` the test data is projected onto the class's features
    with ``feature_data_generation``, the model file
    ``<model_pre>_class<i>_top<temp_attr_len>.model`` is loaded, and
    ``svm_predict`` is run with ``'-b 1'`` on one-vs-rest labels.

    Args:
        model_pre: prefix of the saved model file names.
        test_x_matrix: 2-D test data.
        test_y_vector: true test labels.
        feature_array: 2-D array with one row of feature indices per class.
        attr_num: forwarded to ``feature_data_generation``.
        logger: optional logger; created with ``init_logging("")`` if omitted.

    Returns:
        ``(svm_accuracy, svm_train_time, svm_test_time, svm_predict_y)``.
        ``svm_train_time`` is the time spent loading models and
        ``svm_test_time`` the time spent in ``svm_predict``.
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')
    num_classes, num_features = feature_array.shape
    test_row, test_col = test_x_matrix.shape
    svm_predict_matrix = np.zeros((test_row, num_classes))
    svm_predict_proba = np.zeros((test_row, num_classes))
    svm_train_time = 0
    svm_test_time = 0

    for i in range(num_classes):
        logger.info("class: " + str(i))
        logger.info(str(feature_array[i]))
        temp_test_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            test_x_matrix, attr_num, feature_array[i])
        model_file = model_pre + '_class' + str(i) + "_top" + str(
            temp_attr_len) + ".model"
        print(model_file)
        logger.info('model file: ' + model_file)

        start_time = time.time()
        svm_model = svm_load_model(model_file)
        svm_train_time = svm_train_time + time.time() - start_time

        if i == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(temp_test_x_matrix.shape))

        # One-vs-rest labels as plain lists for svm_predict.
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
        temp_test_x_matrix = temp_test_x_matrix.tolist()
        temp_test_y_vector = temp_test_y_vector.astype(int).tolist()

        start_time = time.time()
        temp_predict_y, temp_accuracy, temp_predict_y_proba = svm_predict(
            temp_test_y_vector, temp_test_x_matrix, svm_model, '-b 1')
        # Prediction time belongs to the test timer; it used to be added to
        # the train timer, leaving svm_test_time at 0.
        svm_test_time = svm_test_time + time.time() - start_time

        temp_accuracy, temp_precision, temp_recall, temp_f1_value, temp_tp, temp_fp, temp_tn, temp_fn = f1_value_precision_recall_accuracy(
            temp_predict_y, temp_test_y_vector)
        temp_predict_y = np.array(temp_predict_y)
        temp_predict_y_proba = np.array(temp_predict_y_proba)

        logger.info("Accuracy for class " + str(i) + ": " + str(temp_accuracy))
        logger.info("Recall for class " + str(i) + ": " + str(temp_recall))
        logger.info("Precision for class " + str(i) + ": " + str(temp_precision))
        logger.info("F1 Score for class " + str(i) + ": " + str(temp_f1_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(temp_tp) + " FP=" + str(temp_fp))
        logger.info("TN=" + str(temp_tn) + " FN=" + str(temp_fn))

        svm_predict_matrix[:, i] = temp_predict_y
        svm_predict_proba[:, i] = temp_predict_y_proba[:, 1]
        logger.info('=============')

    svm_accuracy, svm_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        svm_predict_matrix, svm_predict_proba, test_y_vector)
    return svm_accuracy, svm_train_time, svm_test_time, svm_predict_y
def run_feature_projected_cnn(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, data_stru, cnn_setting, feature_dict, top_k, saver_file_profix='', class_id=-1, logger=None):
    """Train one binary CNN per class on that class's top-k attributes.

    For every class ``i`` in the selected range the labels are binarised
    (``i`` vs. rest), the third axis of the data is restricted to
    ``feature_dict[i][0:top_k]``, and ``cnn_train`` is run on the result. The
    positive-class probabilities are collected column by column and merged by
    ``predict_matrix_with_prob_to_predict_accuracy``.

    Args:
        train_x_matrix: 4-D training data; the third axis is indexed by the
            selected attributes.
        train_y_vector: Numpy array of training labels.
        test_x_matrix: 4-D test data with the same layout.
        test_y_vector: Numpy array of test labels.
        data_stru: Data description object. Its ``num_classes`` is set to 2
            while the binary models are built and restored afterwards, even
            when a class run raises.
        cnn_setting: CNN configuration. ``num_classes`` is set to 2 and
            ``feature_method`` to ``'none'``; these two are NOT restored
            (unchanged from the previous behaviour).
        feature_dict: Per-class ranked attribute indices, indexable by class.
        top_k: Number of leading attributes kept per class.
        saver_file_profix: Prefix for the checkpoint file names.
        class_id: ``-1`` to run every class from ``min(train_y_vector)`` to
            ``max(train_y_vector)``, otherwise the single class to run.
        logger: Optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        Tuple ``(all_accuracy, all_f1_value, all_predict_y, all_train_time,
        all_test_time, all_predict_matrix)``; the f1/time entries are lists
        with one element per processed class.
    """
    if logger is None:
        logger = init_logging('')
    method = 'cnn'
    num_classes = 2
    test_row, _, _, input_map = test_x_matrix.shape
    real_num_classes = data_stru.num_classes
    all_predict_matrix = np.zeros((test_row, real_num_classes))

    if class_id == -1:
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
    else:
        min_class = class_id
        max_class = class_id + 1
    saver_file_profix = saver_file_profix + '_class'
    keep_saver_file = ''
    all_f1_value = []
    all_train_time = []
    all_test_time = []

    data_stru.num_classes = num_classes
    cnn_setting.num_classes = num_classes
    cnn_setting.feature_method = 'none'
    try:
        for i in range(min_class, max_class):
            logger.info('class: ' + str(i))
            temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
            temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
            class_saver_profix = saver_file_profix + str(i)
            fold_positive_len = len(np.where(temp_train_y_vector == 1)[0])
            fold_negative_len = len(temp_train_y_vector) - fold_positive_len
            logger.info("=====")
            logger.info("positive class labels length: " + str(fold_positive_len))
            logger.info("negative class labels length: " + str(fold_negative_len))

            class_feature = feature_dict[i][0:top_k]
            print("class: " + str(i))
            print("number of features: " + str(top_k))
            print("Top features list: " + str(class_feature))
            logger.info("Top feature list: " + str(class_feature))

            temp_train_x_matrix = train_x_matrix[:, :, class_feature, :]
            temp_test_x_matrix = test_x_matrix[:, :, class_feature, :]
            temp_train_y_matrix = y_vector_to_matrix(temp_train_y_vector, num_classes)
            temp_test_y_matrix = y_vector_to_matrix(temp_test_y_vector, num_classes)

            if i == min_class:
                # The graph is built once and reused for every class.
                (train_x_placeholder, output_y_placeholder, predict_y_prob,
                 keep_prob_placeholder, keeped_feature_list,
                 saver_file) = cnn_set_flow_graph(data_stru, cnn_setting,
                                                  input_map, False, logger)
                keep_saver_file = saver_file
            saver_file = cnn_setting.temp_obj_folder + class_saver_profix + keep_saver_file + "_top" + str(
                top_k)
            print(saver_file)

            (class_eval_value, class_train_time, class_test_time,
             class_predict_prob, fold_saver_file, fold_obj_file) = cnn_train(
                 temp_train_x_matrix, temp_train_y_matrix, temp_test_x_matrix,
                 temp_test_y_matrix, num_classes, cnn_setting,
                 train_x_placeholder, output_y_placeholder, predict_y_prob,
                 keep_prob_placeholder, keeped_feature_list, saver_file, logger)
            class_predict_y = np.argmax(class_predict_prob, axis=1)
            (class_accuracy, precision, recall, class_f1, tp, fp, tn,
             fn) = f1_value_precision_recall_accuracy(class_predict_y,
                                                      temp_test_y_vector, 1)
            # A NaN evaluation value is reported as an f1 of 0 for the class.
            if str(class_eval_value) == 'nan':
                class_eval_value = 0
                class_f1 = 0
            logger.info(method + " f1 for class " + str(i) + ": " + str(class_f1))
            logger.info(method + " accuracy for class " + str(i) + ": " +
                        str(class_accuracy))
            logger.info(method + ' model saved: ' + fold_saver_file)
            all_f1_value.append(class_f1)
            all_train_time.append(class_train_time)
            all_test_time.append(class_test_time)
            all_predict_matrix[:, i] = class_predict_prob[:, 1]
    finally:
        # Give the caller its data_stru back untouched even if a run failed.
        data_stru.num_classes = real_num_classes

    all_accuracy, all_predict_y = predict_matrix_with_prob_to_predict_accuracy(
        all_predict_matrix, test_y_vector)
    return all_accuracy, all_f1_value, all_predict_y, all_train_time, all_test_time, all_predict_matrix
def run_feature_rf_use_proba(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, feature_array, attr_num, logger=None):
    """Run one binary random forest per class and merge the results.

    Each row of ``feature_array`` drives ``feature_data_generation`` for the
    matching class; the labels are binarised (class vs. rest) and handed to
    ``run_rf``. Hard predictions and the column-1 probabilities are collected
    per class and combined by
    ``predict_matrix_with_proba_to_predict_accuracy``.

    Returns:
        Tuple ``(rf_accuracy, rf_train_time, rf_test_time, rf_predict_y)``
        where the two times are summed over all classes.
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')

    class_count, _feature_count = feature_array.shape
    row_count, _col_count = test_x_matrix.shape
    label_votes = np.zeros((row_count, class_count))
    proba_votes = np.zeros((row_count, class_count))
    total_train_time = 0
    total_test_time = 0

    for cls in range(class_count):
        logger.info("class: " + str(cls))
        logger.info(str(feature_array[cls]))
        sub_train_x, _, _ = feature_data_generation(train_x_matrix, attr_num,
                                                    feature_array[cls])
        sub_test_x, _, _ = feature_data_generation(test_x_matrix, attr_num,
                                                   feature_array[cls])
        if cls == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(sub_train_x.shape))
            logger.info(str(sub_test_x.shape))

        # One-vs-rest labels: 1 for the current class, 0 otherwise.
        bin_train_y = np.where(train_y_vector == cls, 1, 0)
        bin_test_y = np.where(test_y_vector == cls, 1, 0)

        _, cls_pred, cls_proba, fit_time, pred_time = run_rf(
            sub_train_x, bin_train_y, sub_test_x, bin_test_y, 20, True)
        acc, prec, rec, f1_score, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            cls_pred, bin_test_y)

        for metric_name, metric_value in (("Accuracy", acc), ("Recall", rec),
                                          ("Precision", prec),
                                          ("F1 Score", f1_score)):
            logger.info(metric_name + " for class " + str(cls) + ": " +
                        str(metric_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(tp) + " FP=" + str(fp))
        logger.info("TN=" + str(tn) + " FN=" + str(fn))

        total_train_time = total_train_time + fit_time
        total_test_time = total_test_time + pred_time

        # Unpacking fails early when the probabilities are not 2-D.
        _proba_rows, _proba_cols = cls_proba.shape
        label_votes[:, cls] = cls_pred
        proba_votes[:, cls] = cls_proba[:, 1]
        logger.info('=============')

    overall_accuracy, overall_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        label_votes, proba_votes, test_y_vector)
    return overall_accuracy, total_train_time, total_test_time, overall_predict_y
# Driver script: runs the DCPC post-processing step for one dataset.

# Disabled earlier experiments, kept for reference:
#k = 2
#clever_rank(train_x_matrix, k, threshold)
#dcpc = computeDCPC(train_x_matrix, threshold)
#clever_cluster(dcpc, k)
#sdfs

# Dataset selector. Values that were assigned and immediately overwritten
# before: 'dsa', 'rar', 'arc', 'asl'.
data_keyword = 'fixed_arc'
# Fold layout. 'train_test_10_fold/' and 'train_test_3_fold/' were assigned
# and overwritten before.
data_folder = '../../data/' + data_keyword + '/train_test_1_fold/'
class_column = 0
num_classes = 18
threshold = 0.9
obj_folder = init_folder('../../object/' + data_keyword + '/tkde_2005_dcpc/')
log_folder = init_folder('../../log/' + data_keyword + '/tkde_2005/')

# Disabled run of run_dcpc_main (the only consumer of data_folder,
# class_column and threshold in this script):
#log_file = log_folder + data_keyword + "_tkde_dcpc.log"
#logger = init_logging(log_file)
#run_dcpc_main(data_folder, class_column, num_classes, obj_folder, threshold, logger)

method = 0
log_file = log_folder + data_keyword + "_dcpc_to_score.log"
logger = init_logging(log_file)
run_dcpc_processing(obj_folder, num_classes, method, logger)
def cnn_train(train_x_matrix, train_y_matrix, test_x_matrix, test_y_matrix, num_classes, cnn_setting, input_x_placeholder, output_y_placeholder, logits_out, keep_prob, keeped_feature_list, saver_file="./", logger=None):
    """Train the configured CNN graph and evaluate it on the test data.

    Mini-batches are drawn from a per-epoch permutation seeded with the epoch
    number. Every 100 steps the test data is evaluated; the session is
    checkpointed whenever that value improves. Training stops once the test
    value reaches ``cnn_setting.stop_threshold`` or the epoch counter exceeds
    ``cnn_setting.max_iter``.

    When ``cnn_setting.eval_method == 'f1'`` each batch is re-balanced:
    classes with fewer than ``batch_size / num_classes`` rows are topped up
    from the training set, classes with more than twice that many are thinned,
    and the resulting class sizes are turned into ``pos_weight`` coefficients
    for the weighted cross entropy.

    Args:
        train_x_matrix, test_x_matrix: 4-D numpy input data.
        train_y_matrix, test_y_matrix: One-hot label matrices.
        num_classes: Number of classes (recomputed from the labels in 'f1'
            mode).
        cnn_setting: Settings object; reads ``eval_method``, ``batch_size``,
            ``stop_threshold``, ``max_iter``, ``feature_method``,
            ``out_obj_folder`` and ``out_model_folder``.
        input_x_placeholder, output_y_placeholder, keep_prob: Graph
            placeholders to feed.
        logits_out: Un-normalised network output tensor.
        keeped_feature_list: Tensors whose values are exported when
            ``feature_method`` is not ``'none'``.
        saver_file: Checkpoint file name, appended to the model folder.
        logger: Optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        Tuple ``(best_eval_value, train_run_time, test_run_time,
        cnn_predict_proba, saver_file, feature_obj_file)``; the last element
        is ``''`` when ``feature_method == 'none'``.
    """
    if logger is None:
        logger = init_logging('')
    min_class = 0
    eval_method = cnn_setting.eval_method
    batch_size = cnn_setting.batch_size
    stop_threshold = cnn_setting.stop_threshold
    max_iter = cnn_setting.max_iter
    feature_obj_file = cnn_setting.out_obj_folder + saver_file
    saver_file = cnn_setting.out_model_folder + saver_file

    predict_y_proba = tf.nn.softmax(logits_out)
    prediction = tf.argmax(predict_y_proba, 1)
    actual = tf.argmax(output_y_placeholder, 1)
    correct_prediction = tf.equal(prediction, actual)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    if eval_method == 'f1':
        train_y_vector = np.argmax(train_y_matrix, axis=1)
        train_class_index_dict, train_min_length, train_max_length = class_label_vector_checking(
            train_y_vector)
        min_class = 0
        max_class = max(train_y_vector)
        num_classes = max_class + 1
        if max_class == 1:
            # Binary problem: build the F1 score directly in the graph.
            TP = tf.count_nonzero(prediction * actual, dtype=tf.float32)
            TN = tf.count_nonzero((prediction - 1) * (actual - 1),
                                  dtype=tf.float32)
            FP = tf.count_nonzero(prediction * (actual - 1), dtype=tf.float32)
            FN = tf.count_nonzero((prediction - 1) * actual, dtype=tf.float32)
            precision = (TP) / (TP + FP)
            recall = (TP) / (TP + FN)
            f1 = (2 * precision * recall) / (precision + recall)
            eval_method_value = f1
            eval_method_keyword = "f1"
        else:
            eval_method_value = accuracy
            eval_method_keyword = "acc with batch"
        coefficient_placeholder = tf.placeholder(tf.float32,
                                                 shape=[num_classes])
        cross_entropy = tf.reduce_mean(
            tf.nn.weighted_cross_entropy_with_logits(
                targets=output_y_placeholder,
                logits=logits_out,
                pos_weight=coefficient_placeholder))
    else:
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=output_y_placeholder, logits=logits_out))
        eval_method_value = accuracy
        eval_method_keyword = "acc"

    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
    cnn_session = tf.InteractiveSession()
    cnn_session.run(tf.global_variables_initializer())

    test_eval_value = 0
    best_eval_value = 0
    i = 0
    start = 0
    epoch = 0
    end = batch_size
    batch_each_class = int(batch_size / num_classes)
    overall_len = len(train_y_matrix)
    saver = tf.train.Saver()
    train_run_time = 0
    np.random.seed(epoch)
    batch_index = np.random.permutation(overall_len)
    logger.info("Random Epoch:" + str(epoch) + str(batch_index[0:5]))
    f1_unbalance_count = np.zeros(num_classes)

    while (test_eval_value < stop_threshold):
        if start >= overall_len:
            # Epoch finished: reshuffle with a seed derived from the epoch.
            start = 0
            end = start + batch_size
            epoch = epoch + 1
            np.random.seed(epoch)
            batch_index = np.random.permutation(overall_len)
            # Logged after reshuffling so the message shows this epoch's
            # permutation rather than the previous one.
            logger.info("Random Epoch:" + str(epoch) + str(batch_index[0:5]))
        elif end > overall_len:
            end = overall_len
        batch_x_matrix = train_x_matrix[batch_index[start:end], :, :, :]
        batch_y_matrix = train_y_matrix[batch_index[start:end], :]

        if eval_method == 'f1':
            if i == 0:
                logger.info("Batch controlled")
            # BATCH_CONTROLLED: re-balance the class sizes inside the batch.
            batch_y_vector = np.argmax(batch_y_matrix, axis=1)
            batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                batch_y_vector)
            if i < 3:
                logger.info("class index before: ")
                logger.info(batch_class_index_dict)
            coefficients_vector = []
            batch_class_index_dict_keys = batch_class_index_dict.keys()
            for c_label in range(min_class, max_class + 1):
                if c_label not in batch_class_index_dict_keys:
                    # Class missing from the batch: add rows from the
                    # training set.
                    f1_unbalance_count[
                        c_label] = f1_unbalance_count[c_label] + 1
                    c_label_index = train_class_index_dict[c_label]
                    c_label_index_len = len(c_label_index)
                    add_index_vector_len = 0
                    if c_label_index_len > batch_each_class:
                        add_index_vector = np.random.choice(c_label_index_len,
                                                            batch_each_class,
                                                            replace=False)
                        if (i < 3):
                            logger.info("add index vector for c " +
                                        str(c_label))
                            logger.info(add_index_vector)
                        add_index_vector_len = len(add_index_vector)
                        batch_x_matrix = np.concatenate(
                            (batch_x_matrix, train_x_matrix[
                                c_label_index[add_index_vector], :, :, :]),
                            axis=0)
                        batch_y_matrix = np.concatenate(
                            (batch_y_matrix,
                             train_y_matrix[c_label_index[add_index_vector], :]
                             ),
                            axis=0)
                    else:
                        batch_x_matrix = np.concatenate(
                            (batch_x_matrix,
                             train_x_matrix[c_label_index, :, :, :]),
                            axis=0)
                        batch_y_matrix = np.concatenate(
                            (batch_y_matrix, train_y_matrix[c_label_index, :]),
                            axis=0)
                        add_index_vector_len = c_label_index_len
                else:
                    batch_class_index = batch_class_index_dict[c_label]
                    add_index_vector_len = len(batch_class_index)
                    c_label_index = train_class_index_dict[c_label]
                    c_label_index_len = len(c_label_index)
                    if add_index_vector_len < batch_each_class:
                        # Under-represented: top up to batch_each_class.
                        add_count = batch_each_class - add_index_vector_len
                        if c_label_index_len > add_count:
                            add_index_vector = np.random.choice(
                                c_label_index_len, add_count, replace=False)
                            if (i < 3):
                                logger.info("add index vector for c " +
                                            str(c_label))
                                logger.info(add_index_vector)
                            add_index_vector_len = add_index_vector_len + len(
                                add_index_vector)
                            batch_x_matrix = np.concatenate(
                                (batch_x_matrix, train_x_matrix[
                                    c_label_index[add_index_vector], :, :, :]),
                                axis=0)
                            batch_y_matrix = np.concatenate(
                                (batch_y_matrix, train_y_matrix[
                                    c_label_index[add_index_vector], :]),
                                axis=0)
                        else:
                            batch_x_matrix = np.concatenate(
                                (batch_x_matrix,
                                 train_x_matrix[c_label_index, :, :, :]),
                                axis=0)
                            batch_y_matrix = np.concatenate(
                                (batch_y_matrix,
                                 train_y_matrix[c_label_index, :]),
                                axis=0)
                            add_index_vector_len = add_index_vector_len + c_label_index_len
                    elif add_index_vector_len > 2 * batch_each_class:
                        # Over-represented: drop rows down to twice the
                        # per-class target.
                        remove_count = (add_index_vector_len -
                                        2 * batch_each_class)
                        remove_index_vector = np.random.choice(
                            batch_class_index, remove_count, replace=False)
                        add_index_vector_len = add_index_vector_len - len(
                            remove_index_vector)
                        batch_x_matrix = np.delete(batch_x_matrix,
                                                   remove_index_vector,
                                                   axis=0)
                        batch_y_matrix = np.delete(batch_y_matrix,
                                                   remove_index_vector,
                                                   axis=0)
                        # Row positions shifted, so the per-class index must
                        # be rebuilt before the next class is handled.
                        batch_y_vector = np.argmax(batch_y_matrix, axis=1)
                        batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                            batch_y_vector)
                coefficients_vector.append(float(add_index_vector_len))
            # Weight of a class = size of the largest class / its own size.
            coefficients_vector = np.array(coefficients_vector)
            batch_max_len = float(max(coefficients_vector))
            coefficients_vector = batch_max_len / coefficients_vector
            if i < 3:
                batch_y_vector = np.argmax(batch_y_matrix, axis=1)
                batch_class_index_dict, batch_min_length, batch_max_length = class_label_vector_checking(
                    batch_y_vector)
                logger.info("class index after: ")
                logger.info(batch_class_index_dict)
                logger.info("coefficient vector: ")
                logger.info(coefficients_vector)
            start_time = time.time()
            train_step.run(
                feed_dict={
                    input_x_placeholder: batch_x_matrix,
                    output_y_placeholder: batch_y_matrix,
                    coefficient_placeholder: coefficients_vector,
                    keep_prob: 1
                })
            train_run_time = train_run_time + time.time() - start_time
        else:
            start_time = time.time()
            train_step.run(
                feed_dict={
                    input_x_placeholder: batch_x_matrix,
                    output_y_placeholder: batch_y_matrix,
                    keep_prob: 1
                })
            train_run_time = train_run_time + time.time() - start_time

        if i % 100 == 0:
            fir_weight_variable = tf.get_default_graph().get_tensor_by_name(
                "conv_w_0:0")
            logger.info("fir weight")
            logger.info(fir_weight_variable.get_shape())
            fir_weight_var_val = cnn_session.run(fir_weight_variable)
            logger.info(fir_weight_var_val[0, 0:5, 0, 0])
            test_eval_value = eval_method_value.eval(
                feed_dict={
                    input_x_placeholder: test_x_matrix,
                    output_y_placeholder: test_y_matrix,
                    keep_prob: 1
                })
            if np.isnan(test_eval_value):
                test_eval_value = 0
            print_str = "step " + str(
                i) + ", testing " + eval_method_keyword + ": " + str(
                    test_eval_value)
            logger.info(print_str)
            if best_eval_value < test_eval_value:
                # Save the variables to disk.
                best_eval_value = test_eval_value
                save_path = saver.save(cnn_session, saver_file)
                print_str = "Model saved in file: " + save_path + ' at iteration: ' + str(
                    i)
                logger.info(print_str)
        i = i + 1
        start = end
        end = end + batch_size
        if epoch > max_iter:
            logger.info("best eval value at epoch: " + str(epoch))
            logger.info("best eval value to break")
            logger.info(best_eval_value)
            break

    start_time = time.time()
    test_eval_value = eval_method_value.eval(
        feed_dict={
            input_x_placeholder: test_x_matrix,
            output_y_placeholder: test_y_matrix,
            keep_prob: 1
        })
    test_run_time = time.time() - start_time
    # Treat NaN like the in-loop evaluation does. Every comparison with NaN
    # is False, so without this a NaN here would replace the best value and
    # skip restoring the best checkpoint.
    if np.isnan(test_eval_value):
        test_eval_value = 0
    if test_eval_value < best_eval_value:
        # The final weights are worse than the checkpoint: reload it.
        cnn_session.close()
        cnn_session = tf.InteractiveSession()
        saver.restore(cnn_session, saver_file)
    else:
        best_eval_value = test_eval_value
    logger.info("Running iteration: %d" % (i))
    logger.info("final best " + eval_method_keyword + ": " +
                str(best_eval_value))
    logger.info(f1_unbalance_count)
    cnn_predict_proba = cnn_session.run(predict_y_proba,
                                        feed_dict={
                                            input_x_placeholder: test_x_matrix,
                                            keep_prob: 1.0
                                        })
    logger.info("CNN model saved: " + str(saver_file))
    if cnn_setting.feature_method == 'none':
        cnn_session.close()
        return best_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, ''

    logger.info("feature value generation")
    test_keeped_feature_value_list = cnn_session.run(keeped_feature_list,
                                                     feed_dict={
                                                         input_x_placeholder:
                                                         test_x_matrix,
                                                         keep_prob: 1.0
                                                     })
    logger.info('test feature list ready')
    # Training features are extracted in slices of 1000 rows, and only for
    # the first kept tensor.
    start = 0
    end = 0
    train_row = len(train_x_matrix)
    train_obj_list = []
    while (start < train_row):
        logger.info(start)
        end = start + 1000
        if end > train_row:
            end = train_row
        keep_obj = cnn_session.run(keeped_feature_list[0],
                                   feed_dict={
                                       input_x_placeholder:
                                       train_x_matrix[start:end, :, :, :],
                                       keep_prob: 1.0
                                   })
        train_obj_list.append(keep_obj)
        start = end
    logger.info('train feature list ready')
    logger.info(
        "The order of feature value list: fir_out_conv_no_act, fir_out_conv, fir_weight, fir_bias, last_conv, weight_full, bias_full"
    )
    logger.info("All features saved to ")
    logger.info("CNN feature list saved to: " + feature_obj_file)
    save_obj([train_obj_list, test_keeped_feature_value_list],
             feature_obj_file)
    cnn_session.close()
    return best_eval_value, train_run_time, test_run_time, cnn_predict_proba, saver_file, feature_obj_file
# Scratch check of the libsvm wrapper: train and save a model, then reload it
# and predict with probability estimates.
#
# Disabled data setup (the *_matrix / *_vector names used below must already
# be defined elsewhere for this to run):
#y, x = svm_read_problem('/Users/ivan/Platform/python/libsvm-3.22/heart_scale')
#train_x_matrix = np.array(x[0:10])
#train_y_vector = np.array(y[0:10]).astype(np.float64)
#
#test_x_matrix = np.array(x[10:100])
#test_y_vector = np.array(y[10:100]).astype(np.float64)
#print train_y_vector
#distance_matrix, predict_y_vector, train_time, test_time = run_knn_with_dist(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, n_neighbors)
save_file = 'test.model'
logger = init_logging('')
accuracy, predict_y, predict_y_proba, train_time, test_time = run_libsvm(
    train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, logger,
    True, save_file)
# NOTE(review): `sdfs` is not defined in the visible code, so this line raises
# NameError and nothing below it runs -- presumably a deliberate stop marker.
# Delete it to exercise the reload/predict part.
sdfs
model = svm_load_model(save_file)
predict_y, predict_acc, predict_val = svm_predict(test_y_vector.tolist(),
                                                  test_x_matrix.tolist(),
                                                  model, '-b 1')
# Single-argument print calls behave the same under Python 2 and Python 3.
print(predict_acc)
print(predict_y)
print(predict_val)
def cnn_configure(train_x_placeholder, cnn_setting, num_classes, group_all=False, logger=None):
    """Build the convolution / pooling stack, a dense layer and the output layer.

    One convolution (optionally followed by pooling) is created per row of
    ``cnn_setting.conv_kernel_list``. Kernel sizes that are negative or larger
    than the current input are replaced by the input size. The output of the
    last convolution/pooling stage feeds a 400-unit ReLU dense layer with
    dropout, followed by ``conf_out_layer``.

    Args:
        train_x_placeholder: 4-D input tensor.
        cnn_setting: Settings object; reads ``conv_kernel_list``,
            ``pool_rate_list``, ``feature_num_list``, ``activation_fun``,
            ``std_value``, ``same_size`` and ``input_map``.
        num_classes: Number of output units.
        group_all: When true, the first convolution spans every input column.
        logger: Optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        Tuple ``(logits_out, keep_prob_placeholder, keeped_feature_list,
        saver_file)``. ``keeped_feature_list`` holds the tensor that enters
        the dense layer; ``saver_file`` is a checkpoint name derived from the
        kernel and pooling sizes.
    """
    if logger is None:
        logger = init_logging('')
    # CNN Parameters
    conv_kernel_list = cnn_setting.conv_kernel_list
    pool_rate_list = cnn_setting.pool_rate_list
    feature_num_list = cnn_setting.feature_num_list
    activation_fun = cnn_setting.activation_fun
    std_value = cnn_setting.std_value
    same_size = cnn_setting.same_size

    conv_row_num = len(conv_kernel_list)
    saver_file = ''
    keeped_feature_list = []
    num_input_map = cnn_setting.input_map
    strides_list = [1, 1, 1, 1]
    for i in range(0, conv_row_num):
        logger.info('layer: ' + str(i) + " input:")
        logger.info(train_x_placeholder.get_shape())
        conv_row_kernel = conv_kernel_list[i, 0]
        conv_col_kernel = conv_kernel_list[i, 1]
        train_x_row = int(train_x_placeholder.get_shape()[1])
        train_x_col = int(train_x_placeholder.get_shape()[2])

        # A negative or oversized kernel means "use the whole dimension".
        if conv_row_kernel < 0 or conv_row_kernel > train_x_row:
            conv_row_kernel = train_x_row
        num_output_map = feature_num_list[i]
        if ((i == 0 and group_all) or conv_col_kernel > train_x_col
                or conv_col_kernel < 0):
            conv_col_kernel = train_x_col

        saver_file = saver_file + "_c" + str(conv_row_kernel) + "_" + str(
            conv_col_kernel)
        out_conv = conf_conv_layer(i, conv_row_kernel, conv_col_kernel,
                                   train_x_placeholder, num_input_map,
                                   num_output_map, activation_fun,
                                   strides_list, std_value, same_size, logger)
        logger.info("Conv output: " + str(out_conv.get_shape()))

        pool_row_kernel = pool_rate_list[i, 0]
        pool_col_kernel = pool_rate_list[i, 1]
        # The checkpoint name records the requested pooling size, i.e. the
        # value before it is clamped below.
        saver_file = saver_file + "_p" + str(pool_row_kernel) + "_" + str(
            pool_col_kernel)
        out_conv_row = int(out_conv.get_shape()[1])
        out_conv_col = int(out_conv.get_shape()[2])
        if pool_row_kernel > 0 and pool_col_kernel > 0:
            if pool_row_kernel > out_conv_row:
                warning_str = "Warning: given pooling row number " + str(pool_row_kernel) + \
                    " is bigger than the data row number " + str(out_conv_row)
                logger.info(warning_str)
                warning_str = "Setting the pooling row number to be the data row number"
                logger.info(warning_str)
                pool_row_kernel = out_conv_row
            if pool_col_kernel > out_conv_col:
                # Report the column count here (this message used to print
                # the row count).
                warning_str = "Warning: given pooling column number " + \
                    str(pool_col_kernel) + \
                    " is bigger than the data column number " + \
                    str(out_conv_col)
                logger.info(warning_str)
                warning_str = "Setting the pooling column number to be the data column number"
                logger.info(warning_str)
                pool_col_kernel = out_conv_col
            train_x_placeholder = conf_pool_layer(out_conv, pool_row_kernel,
                                                  pool_col_kernel, same_size)
            logger.info("Pooling output: " +
                        str(train_x_placeholder.get_shape()))
        else:
            # Non-positive pooling size disables pooling for this layer.
            train_x_placeholder = out_conv
        num_input_map = num_output_map

    saver_file = saver_file + '.ckpt'

    # Fully connected layer on top of the last convolution/pooling output.
    last_out_conv = train_x_placeholder
    # Only save the matrix before fully connected layer
    keeped_feature_list.append(last_out_conv)
    logger.info("Feature result shape")
    logger.info(last_out_conv.get_shape())
    second_feature_num = int(last_out_conv.get_shape()[1] *
                             last_out_conv.get_shape()[2] *
                             last_out_conv.get_shape()[3])
    output_feature_num = 400
    tf.random.set_random_seed(0)
    weight_fullconn = tf.Variable(
        tf.truncated_normal([second_feature_num, output_feature_num],
                            stddev=std_value))
    logger.info("full conn weight shape")
    logger.info(weight_fullconn.get_shape())
    bias_fullconn = tf.Variable(
        tf.constant(std_value, shape=[output_feature_num]))

    h_pool2_flat = tf.reshape(last_out_conv, [-1, second_feature_num])
    output_fullconn_no_act = tf.matmul(h_pool2_flat,
                                       weight_fullconn) + bias_fullconn
    output_fullconn = tf.nn.relu(output_fullconn_no_act)
    logger.info('last full connect layer output:')
    logger.info(str(output_fullconn.get_shape()))

    # Dropout; the keep probability is fed at run time.
    keep_prob_placeholder = tf.placeholder(tf.float32)
    output_fullconn_drop = tf.nn.dropout(output_fullconn,
                                         keep_prob_placeholder)
    logits_out = conf_out_layer(output_fullconn_drop, output_feature_num,
                                num_classes, std_value)
    return logits_out, keep_prob_placeholder, keeped_feature_list, saver_file
def run_feature_knn_use_proba(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, feature_array, attr_num, n_neighbors, class_id=-1, logger=None):
    """Run one binary k-NN classifier per class and merge the results.

    For every selected class ``i`` the data is projected with
    ``feature_data_generation`` using ``feature_array[i]``, the labels are
    binarised (``i`` vs. rest) and ``run_knn`` is called with probability
    output. Hard predictions and the column-1 probabilities are collected per
    class and combined by ``predict_matrix_with_proba_to_predict_accuracy``.

    Args:
        train_x_matrix, test_x_matrix: 2-D numpy data matrices.
        train_y_vector, test_y_vector: Numpy label vectors.
        feature_array: 2-D array; row ``i`` describes the features of class
            ``i``.
        attr_num: Passed through to ``feature_data_generation``.
        n_neighbors: Number of neighbours handed to ``run_knn``.
        class_id: ``-1`` to process every class from ``min(train_y_vector)``
            to ``max(train_y_vector)``; otherwise only that class.
        logger: Optional logger; created with ``init_logging("")`` if omitted.

    Returns:
        Tuple ``(knn_accuracy, knn_train_time, knn_test_time, knn_predict_y)``
        with the times summed over the processed classes.
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')
    num_classes, _ = feature_array.shape
    test_row, _ = test_x_matrix.shape
    knn_predict_matrix = np.zeros((test_row, num_classes))
    knn_predict_proba = np.zeros((test_row, num_classes))
    knn_train_time = 0
    knn_test_time = 0
    proba = True
    if class_id == -1:
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
    else:
        min_class = class_id
        max_class = class_id + 1

    for i in range(min_class, max_class):
        logger.info('class: ' + str(i))
        logger.info(str(feature_array[i]))
        temp_train_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            train_x_matrix, attr_num, feature_array[i])
        temp_test_x_matrix, temp_attr_num, temp_attr_len = feature_data_generation(
            test_x_matrix, attr_num, feature_array[i])
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
        if i == 0:
            logger.info('sub feature data shape: ')
            logger.info(str(temp_train_x_matrix.shape))
            logger.info(str(temp_test_x_matrix.shape))

        temp_accuracy, temp_predict_y, temp_predict_y_proba, temp_train_time, temp_test_time = run_knn(
            temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
            temp_test_y_vector, n_neighbors, proba)
        (temp_accuracy, temp_precision, temp_recall, temp_f1_value, temp_tp,
         temp_fp, temp_tn, temp_fn) = f1_value_precision_recall_accuracy(
             temp_predict_y, temp_test_y_vector)

        logger.info("Accuracy for class " + str(i) + ": " + str(temp_accuracy))
        logger.info("Recall for class " + str(i) + ": " + str(temp_recall))
        logger.info("Precision for class " + str(i) + ": " + str(temp_precision))
        logger.info("F1 Score for class " + str(i) + ": " + str(temp_f1_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(temp_tp) + " FP=" + str(temp_fp))
        logger.info("TN=" + str(temp_tn) + " FN=" + str(temp_fn))
        knn_train_time = knn_train_time + temp_train_time
        knn_test_time = knn_test_time + temp_test_time

        # Keep labels and probabilities apart, as the RF and SVM variants of
        # this function do. Previously the probabilities were written into
        # the label matrix and knn_predict_proba was never filled.
        # NOTE(review): assumes the combiner expects (labels, probabilities)
        # -- confirm against predict_matrix_with_proba_to_predict_accuracy.
        knn_predict_matrix[:, i] = temp_predict_y
        knn_predict_proba[:, i] = temp_predict_y_proba[:, 1]
        logger.info('=============')
    knn_accuracy, knn_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        knn_predict_matrix, knn_predict_proba, test_y_vector)
    return knn_accuracy, knn_train_time, knn_test_time, knn_predict_y
def run_feature_projected_classification(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, feature_array, top_k, method, class_id=-1, logger=None):
    """Run a per-class binary classifier on each class's top-k attributes.

    For every selected class ``i`` the third axis of the 4-D data is
    restricted to ``feature_array[i][0:top_k]``, the result is flattened to
    2-D, the labels are binarised (``i`` vs. rest) and the classifier chosen
    by ``method`` is trained and evaluated. Column-1 probabilities are
    collected per class and merged by
    ``predict_matrix_with_prob_to_predict_accuracy``.

    Args:
        train_x_matrix, test_x_matrix: 4-D numpy data; the third axis is the
            attribute axis.
        train_y_vector, test_y_vector: Numpy label vectors.
        feature_array: 2-D array of ranked attribute indices, one row per
            class.
        top_k: Number of leading attributes to keep per class.
        method: ``'knn'``, ``'rf'`` or ``'libsvm'``.
        class_id: ``-1`` to process every class from ``min(train_y_vector)``
            to ``max(train_y_vector)``; otherwise only that class.
        logger: Optional logger; created with ``init_logging('')`` if omitted.

    Returns:
        Tuple ``(all_accuracy, all_f1_value, all_predict_y, all_train_time,
        all_test_time, all_predict_matrix)``; the f1/time entries are lists
        with one element per processed class.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if logger is None:
        logger = init_logging('')
    # Fail early with a clear message instead of a NameError after the first
    # class has already been prepared.
    if method not in ('knn', 'rf', 'libsvm'):
        raise ValueError("unsupported method: " + str(method) +
                         " (expected 'knn', 'rf' or 'libsvm')")

    train_row = train_x_matrix.shape[0]
    test_row, attr_len, _, input_map = test_x_matrix.shape
    real_num_classes, _ = feature_array.shape
    all_predict_matrix = np.zeros((test_row, real_num_classes))

    if class_id == -1:
        min_class = min(train_y_vector)
        max_class = max(train_y_vector) + 1
    else:
        min_class = class_id
        max_class = class_id + 1
    n_neighbors = 1
    samples_leaf = 20
    prob = True
    all_f1_value = []
    all_train_time = []
    all_test_time = []
    for i in range(min_class, max_class):
        logger.info('class: ' + str(i))
        temp_train_y_vector = np.where(train_y_vector == i, 1, 0)
        temp_test_y_vector = np.where(test_y_vector == i, 1, 0)
        fold_positive_len = len(np.where(temp_train_y_vector == 1)[0])
        fold_negative_len = len(temp_train_y_vector) - fold_positive_len
        logger.info("=====")
        logger.info("positive class labels length: " + str(fold_positive_len))
        logger.info("negative class labels length: " + str(fold_negative_len))

        class_feature = feature_array[i][0:top_k]
        logger.info("feature list: " + str(class_feature))
        # Width of the flattened rows. Derived from the number of attributes
        # actually selected, so a feature row shorter than top_k still
        # reshapes correctly.
        feature_col = attr_len * len(class_feature) * input_map
        temp_train_x_matrix = train_x_matrix[:, :, class_feature, :].reshape(
            train_row, feature_col)
        temp_test_x_matrix = test_x_matrix[:, :, class_feature, :].reshape(
            test_row, feature_col)

        if method == 'knn':
            class_accuracy, class_predict_y, class_predict_prob, class_train_time, class_test_time = run_knn(
                temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
                temp_test_y_vector, n_neighbors, prob)
        elif method == 'rf':
            class_accuracy, class_predict_y, class_predict_prob, class_train_time, class_test_time = run_rf(
                temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
                temp_test_y_vector, samples_leaf, prob)
        else:  # 'libsvm'
            class_accuracy, class_predict_y, class_predict_prob, class_train_time, class_test_time = run_libsvm(
                temp_train_x_matrix, temp_train_y_vector, temp_test_x_matrix,
                temp_test_y_vector, logger, prob, '', True)

        class_accuracy, precision, recall, class_f1, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            class_predict_y, temp_test_y_vector, 1)
        logger.info(method + " f1 for class " + str(i) + ": " + str(class_f1))
        logger.info(method + " accuracy for class " + str(i) + ": " +
                    str(class_accuracy))
        all_f1_value.append(class_f1)
        all_train_time.append(class_train_time)
        all_test_time.append(class_test_time)
        all_predict_matrix[:, i] = class_predict_prob[:, 1]

    all_accuracy, all_predict_y = predict_matrix_with_prob_to_predict_accuracy(
        all_predict_matrix, test_y_vector)
    return all_accuracy, all_f1_value, all_predict_y, all_train_time, all_test_time, all_predict_matrix
def run_feature_svm_use_proba(train_x_matrix, train_y_vector, test_x_matrix, test_y_vector, feature_array, attr_num, logger=None, save_pre=''):
    """Train one binary SVM per class on class-specific features and merge them.

    For each row ``c`` of ``feature_array`` the train and test matrices are
    projected with ``feature_data_generation``, labels are binarised against
    ``c``, the training set is passed through ``banlanced_binary_processing``
    with ratio 5, and ``run_sklearn_libsvm`` is run with probabilities on.
    Hard predictions and column 1 of the probability output are written to
    column ``c`` of two ``(test_row, num_classes)`` matrices, which are then
    combined by ``predict_matrix_with_proba_to_predict_accuracy``.

    Args:
        train_x_matrix: training data handed to ``feature_data_generation``.
        train_y_vector: training labels, compared element-wise to class ids.
        test_x_matrix: 2-D test data (its shape is unpacked into two values).
        test_y_vector: test labels.
        feature_array: 2-D array, one row of features per class.
        attr_num: forwarded unchanged to ``feature_data_generation``.
        logger: logger to use; ``init_logging("")`` is called when ``None``.
        save_pre: prefix of the model file name that is logged per class.
            NOTE(review): the file name is only logged here; it is not
            passed to the classifier call in this function.

    Returns:
        Tuple ``(svm_accuracy, svm_train_time, svm_test_time, svm_predict_y)``
        where the two times are sums over all classes.
    """
    if logger is None:
        logger = init_logging("")
        logger.info('no log file: ')

    num_classes, num_features = feature_array.shape
    # Two-way unpacking also enforces that the test matrix is 2-D.
    test_row, test_col = test_x_matrix.shape
    result_shape = (test_row, num_classes)
    svm_predict_matrix = np.zeros(result_shape)
    svm_predict_proba = np.zeros(result_shape)

    svm_train_time = 0
    svm_test_time = 0
    proba = True
    banlanced_ratio = 5

    for class_idx in range(num_classes):
        class_tag = str(class_idx)
        class_features = feature_array[class_idx]
        logger.info("class: " + class_tag)
        logger.info(str(class_features))

        # Project train first, then test; the attribute count reported by
        # the test projection is the one used in the file name below.
        sub_train_x, sub_attr_num, sub_attr_len = feature_data_generation(
            train_x_matrix, attr_num, class_features)
        sub_test_x, sub_attr_num, sub_attr_len = feature_data_generation(
            test_x_matrix, attr_num, class_features)

        if class_idx == 0:
            # Shapes are reported once, for the first class only.
            logger.info('sub feature data shape: ')
            logger.info(str(sub_train_x.shape))
            logger.info(str(sub_test_x.shape))

        # One-vs-rest labels for the current class.
        binary_train_y = np.where(train_y_vector == class_idx, 1, 0)
        binary_test_y = np.where(test_y_vector == class_idx, 1, 0)

        sub_train_x, binary_train_y = banlanced_binary_processing(
            sub_train_x, binary_train_y, banlanced_ratio)

        save_file = save_pre + "_class" + class_tag + "_top" + str(
            sub_attr_num) + ".model"
        logger.info('svm saved to ' + save_file)

        _, predicted_y, predicted_proba, fit_time, predict_time = run_sklearn_libsvm(
            sub_train_x, binary_train_y, sub_test_x, binary_test_y,
            logger, proba)
        accuracy, precision, recall, f1_value, tp, fp, tn, fn = f1_value_precision_recall_accuracy(
            predicted_y, binary_test_y)

        predicted_y = np.array(predicted_y)
        predicted_proba = np.array(predicted_proba)

        metric_report = (
            ("Accuracy", accuracy),
            ("Recall", recall),
            ("Precision", precision),
            ("F1 Score", f1_value),
        )
        for metric_name, metric_value in metric_report:
            logger.info(metric_name + " for class " + class_tag + ": " + str(metric_value))
        logger.info("Prediction matrix:")
        logger.info("TP=" + str(tp) + " FP=" + str(fp))
        logger.info("TN=" + str(tn) + " FN=" + str(fn))

        svm_train_time = svm_train_time + fit_time
        svm_test_time = svm_test_time + predict_time

        # Unpacking requires the probability output to be 2-D.
        proba_row, proba_col = predicted_proba.shape
        svm_predict_matrix[:, class_idx] = predicted_y
        svm_predict_proba[:, class_idx] = predicted_proba[:, 1]
        logger.info('=============')

    svm_accuracy, svm_predict_y = predict_matrix_with_proba_to_predict_accuracy(
        svm_predict_matrix, svm_predict_proba, test_y_vector)
    return svm_accuracy, svm_train_time, svm_test_time, svm_predict_y