Example #1
    def plot_similarity_array(self):

        abs_corr_array = self.workflow.redundancy_method.similarity_array

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(
            op_id_name_map[0], op_id_name_map[1],
            self.workflow.redundancy_method.similarity_array_op_ids)
        measures = np.zeros((2, len(names)))

        tmp_ind = hlp.ismember(
            self.workflow.redundancy_method.similarity_array_op_ids,
            self.workflow.good_op_ids)

        # -- number of problems for which each well-performing feature has been calculated
        measures[0, :] = (~self.workflow.stats_good_op[:, tmp_ind].mask).sum(
            axis=0)
        # -- z-scored U-stat (computed over all features) for the top features
        stats_good_op_z_score = fap.normalise_masked_array(
            self.workflow.stats_good_op_comb, axis=0, norm_type='zscore')[0]
        measures[1, :] = stats_good_op_z_score[tmp_ind]

        fiplt.plot_arr_dendrogram(abs_corr_array,
                                  names,
                                  max_dist_cluster=self.max_dist_cluster,
                                  measures=measures)
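
Example #1 leans on `fap.normalise_masked_array(..., norm_type='zscore')` for the second measure. A rough sketch of what such z-scoring does on masked data (a standalone toy, not the project's actual helper):

import numpy as np
import numpy.ma as ma

def zscore_masked(arr):
    """Z-score each column of a masked array, ignoring masked entries."""
    return (arr - ma.mean(arr, axis=0)) / ma.std(arr, axis=0)

# -- toy data: 3 problems x 4 features, one missing value
stats = ma.masked_invalid(np.array([[1.0, 2.0, np.nan, 4.0],
                                    [2.0, 3.0, 5.0, 1.0],
                                    [3.0, 4.0, 6.0, 2.0]]))
print(zscore_masked(stats))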
Example #2
    def select_good_perf_ops_sort_asc(self):
        """
        Select a subset of well-performing operations
        """
        if self.select_good_perf_ops_norm in ['z-score', 'zscore']:
            all_classes_good_norm = fap.normalise_masked_array(
                self.stats_good_op, axis=1, norm_type='zscore')[0]

        elif self.select_good_perf_ops_norm == 'mean-norm':
            all_classes_good_mean = np.ma.masked_invalid(
                np.ma.mean(self.stats_good_op, axis=1))
            all_classes_good_norm = (self.stats_good_op.T / all_classes_good_mean).T

        else:
            all_classes_good_norm = self.stats_good_op

        # -- sort operations by their mean statistic over all tasks (ascending)
        sort_ind_tmp = np.argsort(all_classes_good_norm.mean(axis=0))

        if self.n_good_perf_ops is None:
            self.stats_good_perf_op_comb = self.stats_good_op_comb[sort_ind_tmp]
            self.good_perf_op_ids = self.good_op_ids[sort_ind_tmp]
        else:
            self.stats_good_perf_op_comb = \
                self.stats_good_op_comb[sort_ind_tmp][:self.n_good_perf_ops]
            self.good_perf_op_ids = \
                self.good_op_ids[sort_ind_tmp][:self.n_good_perf_ops]
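
At its core the selection is an argsort on the per-operation mean statistic. A toy illustration of the `sort_asc` idea (hypothetical data; assuming smaller combined statistics are better, as the ascending sort suggests):

import numpy as np
import numpy.ma as ma

# -- toy stats: 3 tasks x 5 operations, smaller is better, one missing value
stats_good_op = ma.masked_invalid(np.array([[0.3, 0.1, np.nan, 0.7, 0.2],
                                            [0.4, 0.2, 0.9, 0.6, 0.1],
                                            [0.5, 0.1, 0.8, 0.9, 0.3]]))
good_op_ids = np.array([10, 11, 12, 13, 14])

sort_ind = np.argsort(stats_good_op.mean(axis=0))  # best (smallest mean) first
n_good_perf_ops = 3
print(good_op_ids[sort_ind][:n_good_perf_ops])     # -> [11 14 10]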
Example #3
# -- Are the HCTSA files calculated with an old version of the Matlab code?
IS_FROM_OLD_MATLAB = True

# -- What has to be done
COMPUTE_COMPLETE_DATA = True
CALCULATE_U_STATS = True
CALCULATE_ONLY_NEW_U_STATS = False
CALCULATE_U_STATS_ALL_CLASSES_AVG = True
#CALCULATE_BEST_FEATURES = False

# ---------------------------------------------------------------------------------
# -- Compute complete data array for good op_ids
# ---------------------------------------------------------------------------------

if COMPUTE_COMPLETE_DATA:
    data_all, op_id_good = fap.cat_data_from_matfile_root(
        mat_file_paths,
        count_op_id_min,
        is_from_old_matlab=IS_FROM_OLD_MATLAB,
        data_all_good_op_path=data_all_good_op_path,
        op_id_good_path=op_id_good_path,
        is_return_masked=False)

# -- Create masked array from data_all
# data_all = np.ma.masked_invalid(data_all)

# ---------------------------------------------------------------------------------
# -- Calculate U-statistics for the problems
# ---------------------------------------------------------------------------------
if CALCULATE_U_STATS:

    # -- skip problems with already calculated U-stats
    if CALCULATE_ONLY_NEW_U_STATS:
        task_names = tstat.filter_calculated(
            mat_file_root,
            HCTSA_name_search_pattern='HCTSA_(.*)_N_70_100_reduced.mat')
        file_paths = [mat_file_root + "HCTSA_{0:s}_N_70_100_reduced.mat".format(s)
                      for s in task_names]

    # -- calculate U-stats for all problems
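
`tstat.filter_calculated` is used above to collect the task names whose HCTSA result files already exist. A guess at the shape of such a helper, built only on `os` and `re` (illustrative, not the project's implementation):

import os
import re

def list_calculated_tasks(mat_file_root,
                          name_pattern=r'HCTSA_(.*)_N_70_100_reduced\.mat'):
    """Extract task names from already-computed HCTSA files in a directory."""
    regex = re.compile(name_pattern)
    matches = (regex.match(f) for f in os.listdir(mat_file_root))
    return [m.group(1) for m in matches if m]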
Example #4
# -- plot dendrogram --------------------------------------------------
corr_dendrogram = hierarchy.dendrogram(corr_linkage, orientation="left", no_plot=True)
hierarchy.dendrogram(corr_linkage, orientation="left", p=50, truncate_mode="lastp", ax=ax_dendr)
ax_dendr.set_yticks([])

ax_dendr.axvline(max_corr_dist, ls="--", c="k")

# -- plot sorted U-Stat array ------------------------------------------

# -- create index that sorts rows to correspond to the dendrogram
feat_sort_ind = corr_dendrogram["leaves"]
# -- create index that sorts columns (problems) by their mean value
problem_sort_ind = np.argsort(all_classes_avg_top[:, feat_sort_ind].mean(axis=1))
print(problem_sort_ind)
# all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
all_classes_avg_top = fap.normalise_masked_array(all_classes_avg_top, axis=1, norm_type="zscore")[0]
# -- plot the operation names as y-axis tick labels
ax_ustat_arr.matshow(
    all_classes_avg_top[problem_sort_ind, :][:, feat_sort_ind].T,
    aspect=39 / float(50), origin="lower"
)
ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))
ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
# -- plot the problem names as x axis labels
ax_ustat_arr.xaxis.tick_bottom()
ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
ax_ustat_arr.set_xticklabels(problem_names[problem_sort_ind], rotation="vertical")

# -- calculate and plot clusters ----------------------------------
cluster_ind = hierarchy.fcluster(corr_linkage, t=max_corr_dist, criterion="distance")
cluster_bounds = np.nonzero(np.diff(cluster_ind[feat_sort_ind]))[0] + 0.5
for cluster_bound in cluster_bounds:
    # -- mark the boundary between feature clusters (plausible completion;
    # -- the loop body is truncated in the source)
    ax_ustat_arr.axhline(y=cluster_bound, c="w")
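
The linkage/dendrogram/fcluster pattern above can be exercised in isolation. A minimal sketch on random data (all names below are illustrative, not from the original script):

import numpy as np
import scipy.spatial.distance as spdst
from scipy.cluster import hierarchy

rng = np.random.RandomState(0)
data = rng.rand(20, 8)                    # 20 observations x 8 features

# -- absolute feature-feature correlation, turned into a condensed distance
abs_corr = np.abs(np.corrcoef(data.T))
dist_condensed = spdst.squareform(np.around(1 - abs_corr, 7))
link = hierarchy.linkage(dist_condensed, method='complete')

# -- leaf order, as used above to sort the rows of the U-stat array
leaves = hierarchy.dendrogram(link, no_plot=True)['leaves']

# -- flat clusters below a distance threshold; a boundary falls between
# -- consecutive leaves whose cluster label changes
labels = hierarchy.fcluster(link, t=0.8, criterion='distance')
bounds = np.nonzero(np.diff(labels[leaves]))[0] + 0.5
print(bounds)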
Example #5
    def __init__(self, task_names, input_method, stats_method, redundancy_method,
                 combine_tasks_method='mean',
                 combine_tasks_norm=None,
                 select_good_perf_ops_method='sort_asc',
                 select_good_perf_ops_norm='zscore',
                 n_good_perf_ops=None):
        """
        Constructor
        Parameters:
        -----------
        task_names : list of str
            A list of task names to be included in this workflow
        input_method : Data_Input
            The data input method used to read the data from disk.
        stats_method : Feature_Stats
            The method used to calculate the statistics
        redundancy_method : Reducing_Redundancy
            The method used to reduce the redundancy in the well-performing features
        combine_tasks_method : str
            The name of the method used to combine the statistics of all tasks into a
            single 1d array with one entry for every operation
        combine_tasks_norm : str
            The name of the normalisation method applied to the stats of each task
            before the statistics of all tasks are combined
        select_good_perf_ops_method : str
            The name of the method used to sort the operations so the best ones come
            first in self.stats_good_perf_op_comb and self.good_perf_op_ids
        select_good_perf_ops_norm : str
            The name of the norm used when combining the statistics of all tasks for
            each operation
        n_good_perf_ops : int, optional
            Maximum number of entries in self.stats_good_perf_op_comb and
            self.good_perf_op_ids. If None, all good operations are used.
        """
        self.task_names = task_names
        self.input_method = input_method
        self.stats_method = stats_method
        self.redundancy_method = redundancy_method
        self.combine_tasks_norm = combine_tasks_norm
        # normalise_array(data, axis, norm_type='zscore')
        if combine_tasks_method == 'mean':
            self.combine_tasks = self.combine_task_stats_mean

        if combine_tasks_norm == 'zscore':
            self.combine_task_norm_method = lambda y: fap.normalise_masked_array(
                y, axis=1, norm_type='zscore')[0]
        else:
            # -- no normalisation: identity function
            self.combine_task_norm_method = lambda y: y

        if select_good_perf_ops_method == 'sort_asc':
            self.select_good_perf_ops = self.select_good_perf_ops_sort_asc
        self.select_good_perf_ops_norm = select_good_perf_ops_norm

        self.n_good_perf_ops = n_good_perf_ops
        # -- list of Tasks for this workflow
        self.tasks = [Task.Task(task_name, self.input_method, self.stats_method)
                      for task_name in task_names]

        # -- Counter array for the number of problems calculated successfully
        # -- for each operation

        # -- place holders
        self.good_op_ids = []
        self.stats_good_op = None
        self.stats_good_op_comb = None
        self.stats_good_perf_op_comb = None
        self.good_perf_op_ids = None
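
The normalisation strategy is chosen once and stored as a callable, with the identity function as the fallback. A stripped-down sketch of that dispatch (a hypothetical helper mirroring the zscore/identity branch, not `fap.normalise_masked_array` itself):

import numpy as np
import numpy.ma as ma

def make_combine_norm(norm_type=None):
    """Return a per-task normalisation callable; identity if norm_type is None."""
    if norm_type == 'zscore':
        return lambda y: ((y - ma.mean(y, axis=1)[:, np.newaxis])
                          / ma.std(y, axis=1)[:, np.newaxis])
    # -- no normalisation: identity function
    return lambda y: y

# -- usage: combine_task_norm_method = make_combine_norm('zscore')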
Example #6
    # -----------------------------------------------------------------
    # -- Output the results to a text file ----------------------------
    # -----------------------------------------------------------------
    op_id_name_map = plotting.map_op_id_name_mult_task(workflow.tasks)
    # -- write the non-reduced top-performing features, indicating the
    # -- respective clusters they belong to
    measures = np.zeros((3, len(workflow.good_op_ids)))
    # -- op_ids
    measures[0, :] = workflow.good_op_ids
    # -- number of problems calculated
    measures[1, :] = (~workflow.stats_good_op.mask).sum(axis=0)
    # -- z-scored U-stat
    measures[2, :] = fap.normalise_masked_array(workflow.stats_good_op_comb,
                                                axis=0, norm_type='zscore')[0]

    # -- write a text file containing the information shown in the plot
    workflow.redundancy_method.write_cluster_file(result_txt_outpath,
                                                  op_id_name_map, measures)
    
    
    # -----------------------------------------------------------------
    # -- show the plot as the last task of the script
    # -----------------------------------------------------------------
    plt.show()

    # -- write the non-reduced top-performing features to a text file
#     with open(result_txt_outpath,'wb') as out_result_txt_file:
#         for op_id,op_name,op_U in zip(workflow.good_perf_op_ids,
#                                       hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1], workflow.good_perf_op_ids),
#                                       workflow.stats_good_perf_op_comb):
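
The `(~mask).sum(axis=0)` idiom above counts, for each operation, how many problems produced a valid value. In isolation, with toy data:

import numpy as np
import numpy.ma as ma

stats_good_op = ma.masked_invalid(np.array([[0.2, np.nan, 0.5],
                                            [0.3, 0.4, np.nan],
                                            [0.1, np.nan, 0.6]]))
# -- number of problems (rows) calculated successfully per operation (column)
print((~stats_good_op.mask).sum(axis=0))   # -> [3 1 2]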
Example #7
    abs_corr_dist_arr = np.around(1 - abs_corr_array, 7)

    # -- transform the correlation matrix into condensed distance matrix
    dist_corr = spdst.squareform(abs_corr_dist_arr)

    # -- force calculation of linkage
    is_force_calc_link_arr = True

else:
    # -- skip calculation and load linkage from link_arr_path
    is_force_calc_link_arr = False
    abs_corr_dist_arr = None

# -- cluster of indices in abs_corr_dist_arr array
cluster_lst, cluster_size_lst = fap.compute_clusters_from_dist(
    abs_corr_dist_arr=abs_corr_dist_arr,
    link_arr_path=link_arr_path,
    is_force_calc_link_arr=is_force_calc_link_arr)

# -- cluster of operation ids
op_id_cluster = [[sorted_op_ids[ind] for ind in cluster]
                 for cluster in cluster_lst]

# -- load a reference HCTSA_loc.mat containing all op_ids
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

# -- create the clusters of information
tuple_cluster_lst = []
for cluster in cluster_lst:
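
The rounding before `squareform` is not cosmetic: `scipy.spatial.distance.squareform` expects a symmetric matrix with a zero diagonal, and floating-point noise in a computed correlation matrix can violate that check. A toy round trip (illustrative values):

import numpy as np
import scipy.spatial.distance as spdst

# -- toy absolute-correlation matrix (symmetric, unit diagonal)
abs_corr_array = np.array([[1.0, 0.9, 0.2],
                           [0.9, 1.0, 0.3],
                           [0.2, 0.3, 1.0]])

# -- 1 - |corr| has a zero diagonal; rounding removes floating-point noise
abs_corr_dist_arr = np.around(1 - abs_corr_array, 7)
dist_corr = spdst.squareform(abs_corr_dist_arr)   # condensed vector
print(dist_corr)                                  # -> [0.1 0.8 0.7]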