def fea_weight_write(path, fn, feature_weight_list, exc_fun_label, fidx,
                     sort_flag=False, reverse_flag=False):
    """
    Write feature weights to a CSV file at ``path + fn + '.csv'``.
    The directory (path) is also recorded in the file named by
    ``feature_weight_file_path``.

    Input
    -----
    path: {str} the directory where the feature weights are written.
    fn: {str} file name (without extension).
    feature_weight_list: {list}, len is m, each element is a numpy array
        of shape {n_features,}.
    exc_fun_label: {numpy array}, shape {m,}. the row index for the
        feature-weight table.
    fidx: {numpy array}, shape {n_features,}

    Output
    ------
    None
    """
    num_fea = 0
    if len(feature_weight_list) > 0:
        num_fea = len(feature_weight_list[0])
    # new_path = path + "/n{0}/".format(num_fea)
    new_path = path + "/"
    create_path(new_path)
    # Record the output directory once in the bookkeeping file; strip
    # newlines and slashes before comparing, otherwise the membership
    # test never matches and duplicate entries pile up.
    with open(feature_weight_file_path, "a+") as f:
        f.seek(0)
        recorded = [line.strip().strip('/') for line in f.readlines()]
        if new_path.strip('/') not in recorded:
            print(new_path, file=f)
    feature_weight_table_path = new_path + fn + '.csv'
    feature_weight_table = pd.DataFrame(data=np.array(feature_weight_list),
                                        index=exc_fun_label, columns=fidx)
    feature_weight_table.index.name = 'index name'
    print('write : ', feature_weight_table_path)
    feature_weight_table.to_csv(feature_weight_table_path,
                                header=True, index=True)
    plot_table = feature_weight_table
    if sort_flag:
        # Sort each row of weights; optionally reverse to descending order.
        new_arr = np.sort(feature_weight_table.values, axis=1)
        if reverse_flag:
            new_arr = new_arr[:, ::-1]
        plot_table = pd.DataFrame(new_arr,
                                  index=feature_weight_table.index,
                                  columns=feature_weight_table.columns)
        plot_table.index.name = feature_weight_table.index.name
    plot_acc_arr(plot_table, picture_path=new_path + '/' + fn + '.png')
def save_time(fn, fun_name, save_value):
    """Append execution times to ``exec_time.csv``; if an entry for the
    same (data name, fun name, which) triple already exists, average the
    old and new times."""
    time_file_name = 'exec_time.csv'
    columns = ['data name', 'fun name', 'which', 'time']
    time_table = pd.DataFrame(columns=columns)
    time_table.index.name = 'index name'
    if path_isExists(time_file_name):
        time_table = pd.read_csv(time_file_name)
    for which in save_value:
        t_table = pd.DataFrame(
            np.array([fn, fun_name, which, save_value[which]]).reshape(1, -1),
            columns=columns)
        value_dict = {columns[0]: fn, columns[1]: fun_name, columns[2]: which}
        # Boolean mask of rows that already record this triple.
        flag = time_table.apply(lambda x: time_isExit(x, value_dict), axis=1)
        if flag.shape[0] == 0 or not flag.any():
            time_table = pd.concat([time_table, t_table], ignore_index=True)
        else:
            # Entry exists: treat NaN as 0, then average old and new times.
            if time_table.loc[flag, columns[3]].isna().any():
                time_table.loc[flag, columns[3]] = 0
            time_table.loc[flag, columns[3]] = \
                (time_table.loc[flag, columns[3]].astype(float)
                 + save_value[which]) / 2.0
    time_table.to_csv(time_file_name, index=False)
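The expected shape of ``save_value`` is easiest to see in use. A minimal sketch, with hypothetical data-set and selector names ('madelon', 'lap_score') and a timing taken with ``time.time()``:

```python
import time

start = time.time()
# ... run the feature-selection routine being timed ...
elapsed = time.time() - start

# Each key of save_value becomes a 'which' row; its value is averaged
# with any existing entry for the same (data name, fun name, which) triple.
save_time('madelon', 'lap_score', {'rank time': elapsed})
```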
def cal_many_acc_by_idx(x_train, y_train, x_test, y_test,
                        feature_order, idx_array, run_num=10):
    """
    Calculate accuracy for each feature count given in ``idx_array``.

    Input
    -----
    x_train: {numpy array}, shape {n_samples, n_features}
    y_train: {numpy array}, shape {n_samples,}
    x_test: {numpy array}, shape {n2_samples, n_features}
    y_test: {numpy array}, shape {n2_samples,}
    feature_order: {numpy array}, shape {n_features,}
        features ordered from most to least important.
    idx_array: {numpy array}, shape {n,}
        numbers of top-ranked features to evaluate.

    Output
    ------
    acc_array: {numpy array}, shape {n,}
    """
    idx_array = np.array(idx_array)
    acc_array = np.zeros(idx_array.shape)
    for i, num_fea in enumerate(idx_array):
        # Keep only the top ``num_fea`` features of the ranking.
        idx = feature_order[:num_fea]
        new_x_train, new_x_test = x_train[:, idx], x_test[:, idx]
        acc_array[i] = run_acc(new_x_train, y_train, new_x_test, y_test,
                               run_num=run_num)
    return acc_array
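A usage sketch, assuming ``run_acc`` (defined elsewhere in this module) averages classifier accuracy over ``run_num`` runs; the weight vector and data split here are hypothetical:

```python
import numpy as np

# Hypothetical per-feature weights from some selector; rank descending.
feature_order = np.argsort(-weights)           # weights: shape (n_features,)
idx_array = np.array([10, 20, 50, 100])        # feature counts to evaluate
acc = cal_many_acc_by_idx(x_train, y_train, x_test, y_test,
                          feature_order, idx_array, run_num=10)
print(dict(zip(idx_array.tolist(), acc)))
```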
def save_objectv(arr, name, output_path, sort_flag=False, reverse_flag=False):
    """Save a sequence of objective values to CSV and plot it."""
    create_path(output_path)
    table = pd.DataFrame(np.array(arr).reshape(1, -1), index=[name])
    table.index.name = 'index name'
    table.to_csv(output_path + '/' + name + '.csv', header=True, index=True)
    plot_table = table
    if sort_flag:
        # Sort the values; optionally reverse to descending order.
        new_arr = sorted(arr)
        if reverse_flag:
            new_arr = new_arr[::-1]
        plot_table = pd.DataFrame(np.array(new_arr).reshape(1, -1),
                                  index=[name])
        plot_table.index.name = 'index name'
    plot_acc_arr(plot_table, xlabel="iter", ylabel='value',
                 picture_path=output_path + '/' + name + '.png')
def fea_rank_write(path, fn, feature_order_list, exc_fun_label, fidx):
    """
    Write feature rankings to the file at ``path + fn``.
    The directory (path) is also recorded in the file named by
    ``feature_ranking_file_path``.

    Input
    -----
    path: {str} the directory where the feature rankings are written.
    fn: {str} file name.
    feature_order_list: {list}, len is m, each element is a numpy array
        of shape {n_features,}.
    exc_fun_label: {numpy array}, shape {m,}. the row index for the
        feature-ranking table.
    fidx: {numpy array}, shape {n_features,}

    Output
    ------
    None
    """
    num_fea = 0
    if len(feature_order_list) > 0:
        num_fea = len(feature_order_list[0])
    new_path = path + "/n{0}/".format(num_fea)
    create_path(new_path)
    # Record the output directory once; strip newlines before comparing,
    # otherwise the membership test never matches and duplicates pile up.
    with open(feature_ranking_file_path, "a+") as f:
        f.seek(0)
        recorded = [line.strip() for line in f.readlines()]
        if new_path not in recorded:
            print(new_path, file=f)
    feature_order_table_path = new_path + fn
    feature_order_table = pd.DataFrame(data=np.array(feature_order_list),
                                       index=exc_fun_label, columns=fidx)
    feature_order_table.index.name = 'index name'
    print('write : ', feature_order_table_path)
    feature_order_table.to_csv(feature_order_table_path,
                               header=True, index=True)
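A usage sketch with two hypothetical rankings; the output directory, file name, and selector labels are placeholders:

```python
import numpy as np

n_features = 100
rank_a = np.random.permutation(n_features)    # hypothetical ranking A
rank_b = np.random.permutation(n_features)    # hypothetical ranking B

fea_rank_write('./output/madelon', 'ranking.csv',
               [rank_a, rank_b],
               exc_fun_label=np.array(['lap_score', 'fisher_score']),
               fidx=np.arange(n_features))
```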
def my_normalized_mutual_info_score(labels_true, labels_pred):
    """Normalized Mutual Information between two clusterings

    This function changes the normalization term of the stock
    implementation: ``np.sqrt(h_true * h_pred)`` becomes
    ``max(h_true, h_pred)``.

    Normalized Mutual Information (NMI) is a normalization of the Mutual
    Information (MI) score to scale the results between 0 (no mutual
    information) and 1 (perfect correlation). In this function, mutual
    information is normalized by ``max(H(labels_true), H(labels_pred))``.

    This measure is not adjusted for chance. Therefore
    :func:`adjusted_mutual_info_score` might be preferred.

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching ``label_true`` with
    ``label_pred`` will return the same score value. This can be useful to
    measure the agreement of two independent label assignment strategies
    on the same dataset when the real ground truth is not known.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    labels_pred : array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    Returns
    -------
    nmi: float
        score between 0.0 and 1.0. 1.0 stands for perfectly complete
        labeling.

    See also
    --------
    adjusted_rand_score: Adjusted Rand Index
    adjusted_mutual_info_score: Adjusted Mutual Information (adjusted
        against chance)

    Examples
    --------
    Perfect labelings are both homogeneous and complete, hence have
    score 1.0::

      >>> my_normalized_mutual_info_score([0, 0, 1, 1], [0, 0, 1, 1])
      1.0
      >>> my_normalized_mutual_info_score([0, 0, 1, 1], [1, 1, 0, 0])
      1.0

    If class members are completely split across different clusters,
    the assignment is totally incomplete, hence the NMI is null::

      >>> my_normalized_mutual_info_score([0, 0, 0, 0], [0, 1, 2, 3])
      0.0

    """
    # Both labelings must be 1-D and of the same length.
    labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
    classes = np.unique(labels_true)
    clusters = np.unique(labels_pred)
    # Special limit cases: no clustering since the data is not split.
    # This is a perfect match hence return 1.0.
    if (classes.shape[0] == clusters.shape[0] == 1 or
            classes.shape[0] == clusters.shape[0] == 0):
        return 1.0
    contingency = contingency_matrix(labels_true, labels_pred)
    contingency = np.array(contingency, dtype='float')
    # Calculate the MI for the two clusterings
    mi = mutual_info_score(labels_true, labels_pred,
                           contingency=contingency)
    # Calculate entropy for each labeling and normalize by the larger one;
    # the 1e-10 floor guards against division by zero.
    h_true, h_pred = entropy(labels_true), entropy(labels_pred)
    nmi = mi / max(max(h_true, h_pred), 1e-10)
    return nmi
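A quick check of the modified normalization (a sketch, assuming scikit-learn >= 0.20, where the stock function accepts ``average_method='max'``; with that setting the two should agree):

```python
from sklearn.metrics import normalized_mutual_info_score

labels_true = [0, 0, 1, 1, 2, 2]
labels_pred = [0, 0, 1, 1, 1, 2]

# The function above divides MI by max(H(true), H(pred)), which can only
# be smaller than or equal to the sqrt-normalized (geometric) score.
print(my_normalized_mutual_info_score(labels_true, labels_pred))
print(normalized_mutual_info_score(labels_true, labels_pred,
                                   average_method='max'))
```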
def mutual_info_score(labels_true, labels_pred, contingency=None):
    """Mutual Information between two clusterings

    The Mutual Information is a measure of the similarity between two
    labels of the same data. Where :math:`P(i)` is the probability of a
    random sample occurring in cluster :math:`U_i` and :math:`P'(j)` is
    the probability of a random sample occurring in cluster :math:`V_j`,
    the Mutual Information between clusterings :math:`U` and :math:`V`
    is given as:

    .. math::

        MI(U,V)=\sum_{i=1}^R \sum_{j=1}^C P(i,j)\log\frac{P(i,j)}{P(i)P'(j)}

    In markdown form (it resembles the entropy formula):
    $$MI(U,V)=\sum_{i=1}^R \sum_{j=1}^C P(i,j)\log\frac{P(i,j)}{P(i)P(j)}$$

    This is equal to the Kullback-Leibler divergence of the joint
    distribution with the product distribution of the marginals.

    This metric is independent of the absolute values of the labels:
    a permutation of the class or cluster label values won't change the
    score value in any way.

    This metric is furthermore symmetric: switching ``label_true`` with
    ``label_pred`` will return the same score value. This can be useful to
    measure the agreement of two independent label assignment strategies
    on the same dataset when the real ground truth is not known.

    Read more in the :ref:`User Guide <mutual_info_score>`.

    Parameters
    ----------
    labels_true : int array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    labels_pred : array, shape = [n_samples]
        A clustering of the data into disjoint subsets.

    contingency : None or array, shape = [n_classes_true, n_classes_pred]
        A contingency matrix given by the :func:`contingency_matrix`
        function. If value is ``None``, it will be computed, otherwise the
        given value is used, with ``labels_true`` and ``labels_pred``
        ignored.

    Returns
    -------
    mi: float
        Mutual information, a non-negative value

    See also
    --------
    adjusted_mutual_info_score: Adjusted against chance Mutual Information
    normalized_mutual_info_score: Normalized Mutual Information
    """
    if contingency is None:
        labels_true, labels_pred = check_clusterings(labels_true, labels_pred)
        contingency = contingency_matrix(labels_true, labels_pred)
    contingency = np.array(contingency, dtype='float')
    # Total number of samples.
    contingency_sum = np.sum(contingency)
    # Marginal counts: occurrences of each row label / each column label.
    pi = np.sum(contingency, axis=1)
    pj = np.sum(contingency, axis=0)
    # outer[i, j] = pi[i] * pj[j]
    outer = np.outer(pi, pj)
    nnz = contingency != 0.0
    # Select the nonzero entries, flattened into a 1-D array.
    contingency_nm = contingency[nnz]
    # log(N(i, j))
    log_contingency_nm = np.log(contingency_nm)
    # contingency_nm now holds p(i, j).
    contingency_nm /= contingency_sum
    # log(a / b) should be calculated as log(a) - log(b) to avoid
    # possible loss of precision.
    # pi.sum() == pj.sum(); both equal the total number of samples.
    # log_outer = -log(p(i) * p(j))
    log_outer = -np.log(outer[nnz]) + math.log(pi.sum()) + math.log(pj.sum())
    # log_contingency_nm - log(contingency_sum) = log(p(i, j))
    mi = (contingency_nm * (log_contingency_nm - math.log(contingency_sum)) +
          contingency_nm * log_outer)
    return mi.sum()
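As a sanity check (not part of the original module), the log-space computation above can be compared against a direct evaluation of the definition; this sketch uses scikit-learn's stock ``mutual_info_score`` and ``contingency_matrix`` as references:

```python
import numpy as np
from sklearn.metrics import mutual_info_score as sk_mi
from sklearn.metrics.cluster import contingency_matrix

labels_true = [0, 0, 1, 1]
labels_pred = [0, 0, 1, 2]

# Direct evaluation: MI = sum_ij p(i,j) * log(p(i,j) / (p(i) * p(j)))
c = contingency_matrix(labels_true, labels_pred).astype(float)
p_ij = c / c.sum()
p_i = p_ij.sum(axis=1, keepdims=True)   # row marginals p(i)
p_j = p_ij.sum(axis=0, keepdims=True)   # column marginals p(j)
nz = p_ij > 0                           # skip zero cells (0 * log 0 = 0)
mi_direct = (p_ij[nz] * np.log(p_ij[nz] / (p_i @ p_j)[nz])).sum()

print(mi_direct, sk_mi(labels_true, labels_pred))  # the two should agree
```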