def plot_similarity_array(self):
    abs_corr_array = self.workflow.redundancy_method.similarity_array
    op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
    names = hlp.ind_map_subset(
        op_id_name_map[0], op_id_name_map[1],
        self.workflow.redundancy_method.similarity_array_op_ids)
    measures = np.zeros((2, len(names)))
    tmp_ind = hlp.ismember(
        self.workflow.redundancy_method.similarity_array_op_ids,
        self.workflow.good_op_ids)
    # -- number of problems for which each good performing feature has been calculated
    measures[0, :] = (~self.workflow.stats_good_op[:, tmp_ind].mask).sum(axis=0)
    # -- z-scored u-stat (over all features) for the top features
    stats_good_op_z_score = fap.normalise_masked_array(
        self.workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]
    measures[1, :] = stats_good_op_z_score[tmp_ind]
    fiplt.plot_arr_dendrogram(abs_corr_array, names,
                              max_dist_cluster=self.max_dist_cluster,
                              measures=measures)
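# -- A minimal, hypothetical sketch of the index lookup that hlp.ismember is
# -- used for above: for each id in the first array, find its position in the
# -- second. This assumes every queried id is present; the function name and
# -- behaviour here are illustrative, not the actual hlp implementation.
import numpy as np

def ismember_sketch(query_ids, reference_ids):
    # Map each reference id to its index, then look up the query ids.
    pos = {op_id: i for i, op_id in enumerate(reference_ids)}
    return np.array([pos[op_id] for op_id in query_ids])

# Example: ids [12, 7] sit at positions [2, 0] in [7, 3, 12]:
# ismember_sketch([12, 7], [7, 3, 12]) -> array([2, 0])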
def select_good_perf_ops_sort_asc(self):
    """Select a subset of well performing operations."""
    if self.select_good_perf_ops_norm in ['z-score', 'zscore']:
        all_classes_good_norm = fap.normalise_masked_array(
            self.stats_good_op, axis=1, norm_type='zscore')[0]
    elif self.select_good_perf_ops_norm == 'mean-norm':
        all_classes_good_mean = np.ma.masked_invalid(
            np.ma.mean(self.stats_good_op, axis=1))
        all_classes_good_norm = (self.stats_good_op.T / all_classes_good_mean).T
    else:
        all_classes_good_norm = self.stats_good_op

    # -- sort operations by their mean statistic over all tasks (ascending)
    sort_ind_tmp = np.argsort(all_classes_good_norm.mean(axis=0))
    if self.n_good_perf_ops is None:
        self.stats_good_perf_op_comb = self.stats_good_op_comb[sort_ind_tmp]
        self.good_perf_op_ids = self.good_op_ids[sort_ind_tmp]
    else:
        self.stats_good_perf_op_comb = self.stats_good_op_comb[sort_ind_tmp][:self.n_good_perf_ops]
        self.good_perf_op_ids = self.good_op_ids[sort_ind_tmp][:self.n_good_perf_ops]
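# -- A hedged sketch of the z-score normalisation presumably performed by
# -- fap.normalise_masked_array(..., norm_type='zscore'): subtract the mean
# -- and divide by the standard deviation along an axis, respecting the mask.
# -- This is an assumption about its behaviour, not the actual fap source.
import numpy as np

def zscore_cols_sketch(arr):
    # z-score each column of a 2-d array, ignoring masked/invalid entries
    arr = np.ma.masked_invalid(arr)
    return (arr - np.ma.mean(arr, axis=0)) / np.ma.std(arr, axis=0)

# Example: stats = [[1.0, 2.0], [3.0, np.nan]] -> column 0 becomes [-1, 1],
# column 1 stays masked where the input was NaN.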
# -- Are the HCTSA files calculated with an old version of the Matlab code?
IS_FROM_OLD_MATLAB = True

# -- What has to be done
COMPUTE_COMPLETE_DATA = True
CALCULATE_U_STATS = True
CALCULATE_ONLY_NEW_U_STATS = False
CALCULATE_U_STATS_ALL_CLASSES_AVG = True
# CALCULATE_BEST_FEATURES = False

# ---------------------------------------------------------------------------------
# -- Compute complete data array for good op_ids
# ---------------------------------------------------------------------------------
if COMPUTE_COMPLETE_DATA:
    data_all, op_id_good = fap.cat_data_from_matfile_root(
        mat_file_paths, count_op_id_min,
        is_from_old_matlab=IS_FROM_OLD_MATLAB,
        data_all_good_op_path=data_all_good_op_path,
        op_id_good_path=op_id_good_path,
        is_return_masked=False)
    # -- Create masked array from data_all
    # data_all = np.ma.masked_invalid(data_all)

# ---------------------------------------------------------------------------------
# -- Calculate U-statistics for the problems
# ---------------------------------------------------------------------------------
if CALCULATE_U_STATS:
    # -- skip problems with already calculated U-stats
    if CALCULATE_ONLY_NEW_U_STATS:
        task_names = tstat.filter_calculated(
            mat_file_root,
            HCTSA_name_search_pattern='HCTSA_(.*)_N_70_100_reduced.mat')
        file_paths = [mat_file_root + "HCTSA_{0:s}_N_70_100_reduced.mat".format(s)
                      for s in task_names]
    # -- calculate U-stats for all problems
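# -- Hedged sketch: if the per-feature "U-stats" are Mann-Whitney U values
# -- comparing a feature's values between two classes, they could be computed
# -- as below with scipy. The interpretation, names and shapes are assumptions;
# -- the actual tstat module may compute its statistics differently.
import numpy as np
from scipy.stats import mannwhitneyu

def u_stats_two_classes(data, labels, class_a, class_b):
    # data: (n_samples, n_features); labels: (n_samples,)
    a, b = data[labels == class_a], data[labels == class_b]
    return np.array([mannwhitneyu(a[:, i], b[:, i])[0]
                     for i in range(data.shape[1])])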
# -- plot dendrogram --------------------------------------------------
corr_dendrogram = hierarchy.dendrogram(corr_linkage, orientation='left',
                                       no_plot=True)
hierarchy.dendrogram(corr_linkage, orientation='left', p=50,
                     truncate_mode='lastp', ax=ax_dendr)
ax_dendr.set_yticks([])
ax_dendr.axvline(max_corr_dist, ls='--', c='k')

# -- plot sorted U-Stat array ------------------------------------------
# -- create index that sorts rows to correspond to the dendrogram
feat_sort_ind = corr_dendrogram['leaves']
# -- create index that sorts columns with respect to their mean value
problem_sort_ind = np.argsort(all_classes_avg_top[:, feat_sort_ind].mean(axis=1))
print(problem_sort_ind)

# all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top, axis=0))
#                        / np.ma.std(all_classes_avg_top, axis=0))
all_classes_avg_top = fap.normalise_masked_array(all_classes_avg_top,
                                                 axis=1, norm_type='zscore')[0]
# -- plot the operation names as y-axis tick labels
ax_ustat_arr.matshow(all_classes_avg_top[problem_sort_ind, :][:, feat_sort_ind].T,
                     aspect=39 / float(50), origin='lower')
ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))
ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
# -- plot the problem names as x-axis tick labels
ax_ustat_arr.xaxis.tick_bottom()
ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
ax_ustat_arr.set_xticklabels(problem_names[problem_sort_ind], rotation='vertical')

# -- calculate and plot clusters ----------------------------------
cluster_ind = hierarchy.fcluster(corr_linkage, t=max_corr_dist,
                                 criterion='distance')
cluster_bounds = np.nonzero(np.diff(cluster_ind[feat_sort_ind]))[0] + 0.5
for cluster_bound in cluster_bounds:
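# -- Minimal sketch of the leaf-ordering trick used above: build a linkage,
# -- let scipy compute the dendrogram without plotting, and use its 'leaves'
# -- index to reorder the data so rows line up with the tree. Toy data only;
# -- the real code reorders the U-stat array by the correlation linkage.
import numpy as np
from scipy.cluster import hierarchy

rng = np.random.RandomState(0)
data = rng.rand(6, 4)                      # 6 features, 4 problems
link = hierarchy.linkage(data, method='complete')
dendro = hierarchy.dendrogram(link, no_plot=True)
reordered = data[dendro['leaves'], :]      # rows now follow the dendrogram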
def __init__(self, task_names, input_method, stats_method, redundancy_method,
             combine_tasks_method='mean', combine_tasks_norm=None,
             select_good_perf_ops_method='sort_asc',
             select_good_perf_ops_norm='zscore', n_good_perf_ops=None):
    """
    Constructor

    Parameters
    ----------
    task_names : list of str
        A list of task names to be included in this workflow
    input_method : Data_Input
        The data input method used to read the data from disk.
    stats_method : Feature_Stats
        The method used to calculate the statistics
    redundancy_method : Reducing_Redundancy
        The method used to reduce the redundancy in the well performing features
    combine_tasks_method : str
        The name describing the method used to combine the statistics for each
        task to create a single 1-d array with a single entry for every operation
    combine_tasks_norm : str
        The name of the normalisation method applied to the stats of each task
        before the statistics for each task are combined
    select_good_perf_ops_method : str
        The name describing the method used to sort the operations so the best
        operations come first in self.stats_good_perf_op_comb and
        self.good_perf_op_ids
    select_good_perf_ops_norm : str
        The name describing the norm used when combining all statistics for all
        tasks for each operation
    n_good_perf_ops : int, optional
        Maximum number of entries in self.stats_good_perf_op_comb and
        self.good_perf_op_ids. If None, all good operations are used.
    """
    self.task_names = task_names
    self.input_method = input_method
    self.stats_method = stats_method
    self.redundancy_method = redundancy_method
    self.combine_tasks_norm = combine_tasks_norm  # normalise_array(data, axis, norm_type='zscore')

    if combine_tasks_method == 'mean':
        self.combine_tasks = self.combine_task_stats_mean

    if combine_tasks_norm == 'zscore':
        self.combine_task_norm_method = lambda y: fap.normalise_masked_array(
            y, axis=1, norm_type='zscore')[0]
    else:
        # -- no normalisation - identity function
        self.combine_task_norm_method = lambda y: y

    if select_good_perf_ops_method == 'sort_asc':
        self.select_good_perf_ops = self.select_good_perf_ops_sort_asc
    self.select_good_perf_ops_norm = select_good_perf_ops_norm
    self.n_good_perf_ops = n_good_perf_ops

    # -- list of Tasks for this workflow
    self.tasks = [Task.Task(task_name, self.input_method, self.stats_method)
                  for task_name in task_names]

    # -- Counter array for number of problems calculated successfully for each operation
    # -- place holders
    self.good_op_ids = []
    self.stats_good_op = None
    self.stats_good_op_comb = None
    self.stats_good_perf_op_comb = None
    self.good_perf_op_ids = None
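# -- Purely illustrative construction of this workflow, kept commented out:
# -- the concrete Data_Input, Feature_Stats and Reducing_Redundancy instances
# -- (and the class name Workflow itself) are assumptions, not the code base.
# workflow = Workflow(
#     task_names=['Beef', 'Coffee'],
#     input_method=some_data_input,        # a Data_Input instance
#     stats_method=some_feature_stats,     # a Feature_Stats instance
#     redundancy_method=some_redundancy,   # a Reducing_Redundancy instance
#     combine_tasks_method='mean',
#     combine_tasks_norm='zscore',
#     select_good_perf_ops_method='sort_asc',
#     select_good_perf_ops_norm='zscore',
#     n_good_perf_ops=100)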
# -----------------------------------------------------------------
# -- Output the results to a text file ----------------------------
# -----------------------------------------------------------------
op_id_name_map = plotting.map_op_id_name_mult_task(workflow.tasks)

# -- write the non-reduced top performing features, indicating the
# -- respective clusters they belong to
measures = np.zeros((3, len(workflow.good_op_ids)))
# -- op_ids
measures[0, :] = workflow.good_op_ids
# -- number of problems for which each good performing feature has been calculated
measures[1, :] = (~workflow.stats_good_op.mask).sum(axis=0)
# -- z-scored u-stat
measures[2, :] = fap.normalise_masked_array(workflow.stats_good_op_comb,
                                            axis=0, norm_type='zscore')[0]
# -- write a text file containing the information as shown in the plot
workflow.redundancy_method.write_cluster_file(result_txt_outpath,
                                              op_id_name_map, measures)

# -----------------------------------------------------------------
# -- show the plot as the last task of the script
# -----------------------------------------------------------------
plt.show()

# -- write non-reduced top performing features to text file
# with open(result_txt_outpath, 'wb') as out_result_txt_file:
#     for op_id, op_name, op_U in zip(workflow.good_perf_op_ids,
#                                     hlp.ind_map_subset(op_id_name_map[0], op_id_name_map[1],
#                                                        workflow.good_perf_op_ids),
#                                     workflow.stats_good_perf_op_comb):
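# -- Hedged sketch of writing such a measures table to a text file. The
# -- column layout and function name are assumptions for illustration; the
# -- actual format is whatever write_cluster_file produces.
import numpy as np

def write_measures_txt(path, op_names, measures):
    # measures: (3, n_ops) -> rows: op_id, n_problems, z-scored u-stat
    with open(path, 'w') as f:
        f.write('op_id\tname\tn_problems\tz_ustat\n')
        for name, op_id, n_prob, z in zip(op_names, *measures):
            f.write('{:d}\t{:s}\t{:d}\t{:.4f}\n'.format(int(op_id), name,
                                                        int(n_prob), z))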
    # -- transform the correlation matrix into a distance measure
    abs_corr_dist_arr = np.around(1 - abs_corr_array, 7)
    # -- transform the correlation matrix into a condensed distance matrix
    dist_corr = spdst.squareform(abs_corr_dist_arr)
    # -- force calculation of linkage
    is_force_calc_link_arr = True
else:
    # -- skip calculation and load linkage from link_arr_path
    is_force_calc_link_arr = False
    abs_corr_dist_arr = None

# -- clusters of indices in the abs_corr_dist_arr array
cluster_lst, cluster_size_lst = fap.compute_clusters_from_dist(
    abs_corr_dist_arr=abs_corr_dist_arr,
    link_arr_path=link_arr_path,
    is_force_calc_link_arr=is_force_calc_link_arr)
# -- clusters of operation ids
op_id_cluster = [[sorted_op_ids[ind] for ind in cluster] for cluster in cluster_lst]

# -- load a reference HCTSA_loc.mat containing all op_ids
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

# -- create the clusters of information tuples
tuple_cluster_lst = []
for cluster in cluster_lst:
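# -- Minimal sketch of what a "clusters from distances" step can look like
# -- with scipy (the actual fap.compute_clusters_from_dist may cache its
# -- linkage on disk and differ in detail): condense the distance matrix,
# -- build a linkage, cut it at a threshold, and group indices by label.
import numpy as np
import scipy.spatial.distance as spdst
from scipy.cluster import hierarchy

def clusters_from_dist_sketch(dist_mat, max_dist):
    # dist_mat: symmetric (n, n) distance matrix with zero diagonal
    link = hierarchy.linkage(spdst.squareform(dist_mat), method='complete')
    labels = hierarchy.fcluster(link, t=max_dist, criterion='distance')
    return [np.nonzero(labels == lab)[0].tolist()
            for lab in np.unique(labels)]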