Exemplo n.º 1
0
    def plot_similarity_array(self):
        """Plot the redundancy similarity array as an annotated dendrogram.

        Fetches the absolute-correlation similarity array computed by the
        workflow's redundancy method, resolves readable names for its
        operation ids, assembles a ``(2, n_features)`` array of summary
        measures and delegates the drawing to ``fiplt.plot_arr_dendrogram``.
        """
        abs_corr_array = self.workflow.redundancy_method.similarity_array
        sim_op_ids = self.workflow.redundancy_method.similarity_array_op_ids

        # -- readable names for the features shown in the similarity array
        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(op_id_name_map[0], op_id_name_map[1],
                                   sim_op_ids)

        # -- where do the plotted features sit among the good operations?
        good_pos = hlp.ismember(sim_op_ids, self.workflow.good_op_ids)

        measures = np.zeros((2, len(names)))
        # -- row 0: number of problems each feature has been calculated for
        measures[0, :] = (~self.workflow.stats_good_op[:, good_pos].mask).sum(
            axis=0)
        # -- row 1: z-scored combined stat restricted to these features
        stats_z = fap.normalise_masked_array(self.workflow.stats_good_op_comb,
                                             axis=0, norm_type='zscore')[0]
        measures[1, :] = stats_z[good_pos]

        fiplt.plot_arr_dendrogram(abs_corr_array, names,
                                  max_dist_cluster=self.max_dist_cluster,
                                  measures=measures)
 def reduce_to_good_perf_ops(self,ops_base_vals,good_perf_op_ids,good_op_ids):
     """Restrict ``ops_base_vals`` to the columns of the good performers.

     Parameters:
     -----------
     ops_base_vals : ndarray
         Array containing the values on which the similarity of the
         operations will be calculated.
     good_perf_op_ids : ndarray
         The op_ids of the features we are interested in.
     good_op_ids : ndarray
         The op_ids of the columns in ``ops_base_vals``.

     Returns:
     --------
     ops_base_perf_vals : ndarray
         ``ops_base_vals`` reduced to the operations whose ids are in
         ``good_perf_op_ids``, in that same order.
     """
     # -- column index of every requested op id within good_op_ids
     col_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
     return ops_base_vals[:, col_ind]
Exemplo n.º 3
0
 def plot_similarity_array(self):
     """Draw the dendrogram plot of the feature similarity array.

     Gathers the similarity (absolute correlation) array from the
     workflow's redundancy method, maps the involved operation ids to
     readable names, computes two per-feature summary measures and
     forwards everything to ``fiplt.plot_arr_dendrogram``.
     """
     workflow = self.workflow
     redundancy = workflow.redundancy_method
     sim_op_ids = redundancy.similarity_array_op_ids

     # -- human readable names for the features in the similarity array
     op_maps = self.map_op_id_name_mult_task(workflow.tasks)
     names = hlp.ind_map_subset(op_maps[0], op_maps[1], sim_op_ids)

     # -- column positions of those features among the good operations
     tmp_ind = hlp.ismember(sim_op_ids, workflow.good_op_ids)

     measures = np.zeros((2, len(names)))
     # -- nr of problems for which each feature has been calculated
     measures[0, :] = (~workflow.stats_good_op[:, tmp_ind].mask).sum(axis=0)
     # -- z-scored combined stat restricted to these features
     zscored = fap.normalise_masked_array(workflow.stats_good_op_comb,
                                          axis=0, norm_type='zscore')[0]
     measures[1, :] = zscored[tmp_ind]

     fiplt.plot_arr_dendrogram(redundancy.similarity_array, names,
                               max_dist_cluster=self.max_dist_cluster,
                               measures=measures)
Exemplo n.º 4
0
 def reduce_to_good_perf_ops(self, ops_base_vals, good_perf_op_ids,
                             good_op_ids):
     """Keep only the columns of ``ops_base_vals`` for the wanted op ids.

     Parameters:
     -----------
     ops_base_vals : ndarray
         Array containing the values on which the similarity of the
         operations will be calculated.
     good_perf_op_ids : ndarray
         The op_ids of the features we are interested in.
     good_op_ids : ndarray
         The op_ids of the columns in ``ops_base_vals``.

     Returns:
     --------
     ops_base_perf_vals : ndarray
         ``ops_base_vals`` reduced to the operations with ids in
         ``good_perf_op_ids``, preserving that ordering.
     """
     # -- translate the wanted op ids into column positions
     keep_ind = hlp.ismember(good_perf_op_ids, good_op_ids)
     ops_base_perf_vals = ops_base_vals[:, keep_ind]
     return ops_base_perf_vals
Exemplo n.º 5
0
 def collect_stats_good_op_ids(self):
     """
     Collect all combined stats for each task and take stats for good operations only.

     Stacks one row-block per task into ``self.stats_good_op`` (a masked
     array, rows = tasks' stats, columns = operations in the order of
     ``self.good_op_ids``). Entries for operations a task did not compute
     remain NaN and end up masked.
     """
     #stats_good_op_ma = np.empty((data.shape[0],np.array(self.good_op_ids).shape[0]))
     stats_good_op_tmp = []
     #stats_good_op_ma[:] = np.NaN
     for task in self.tasks:
         # -- create tmp array for good stats for current task. For sake of simplicity when dealing with different
         # dimensions of task.tot_stats we transpose stats_good_op_ma_tmp so row corresponds to feature temporarily
         if task.tot_stats.ndim > 1:
             stats_good_op_ma_tmp = np.empty((self.good_op_ids.shape[0],task.tot_stats.shape[0]))
         else:
             stats_good_op_ma_tmp = np.empty((self.good_op_ids.shape[0]))
         stats_good_op_ma_tmp[:] = np.NaN
         
         # -- masked positions of each task op id within good_op_ids
         ind = hlp.ismember(task.op_ids,self.good_op_ids,is_return_masked_array = True,return_dtype = int)
         # -- it is position in task.op_ids and i is position in self.good_op_ids
         for it,i in enumerate(ind):
             if i is not np.ma.masked: # -- that means the entry in task.op_ids is also in self.good_op_ids
                 stats_good_op_ma_tmp[i] = task.tot_stats[it].T
         # -- We return to the usual ordering: column equals feature
         stats_good_op_tmp.append(stats_good_op_ma_tmp.T)
     # -- mask every NaN (never-filled) entry of the stacked result
     self.stats_good_op = np.ma.masked_invalid(np.vstack(stats_good_op_tmp))
Exemplo n.º 6
0
def cat_data_op_subset(file_paths,
                       op_id_top,
                       is_from_old_matlab=False,
                       is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.m files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very large.
    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come from the same
    database. Meaning op_ids are the same. Otherwise one would have to go through operation names which is
    only a little more work to implement. XXX
    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while pickling. If this is False the function
        returns a normal ndarray with unknown entries set to NaN. This can be converted to a masked array with
        data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array
    """
    is_first = True
    data_all = None

    for file_path in file_paths:
        # -- parenthesized form works under both Python 2 and 3
        print("Adding data from {:s} \n to complete data matrix".format(
            file_path))
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)

        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'],
                           op_id_top,
                           is_return_masked_array=True,
                           return_dtype=int)
        # -- if any of the operations was not calculated for this problem
        # -- create a masked array and copy only valid data and mask
        # -- invalid data.
        # -- BUGFIX: `ind.data != op_id_top` is an element-wise ndarray
        # -- comparison whose truth value is ambiguous (raises ValueError
        # -- for more than one element); np.array_equal gives the intended
        # -- whole-sequence comparison (and is False on length mismatch).
        if not np.array_equal(ind.data, op_id_top):
            # -- create a masked array filled with NaN.
            # -- This makes later masking of non-existent entries easier
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])

            data_ma = np.empty((data.shape[0], np.array(op_id_top).shape[0]))
            data_ma[:] = np.NaN
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top. We therefore do not need this operation to
                # -- be included in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])

        # -- stick the blocks of all files together, row-wise
        if is_first:
            data_all = data_ma
            is_first = False
        else:
            data_all = np.vstack((data_all, data_ma))
    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
Exemplo n.º 7
0
    def plot_stat_array(self):
        """Plot the (tasks x top features) classification-stat matrix,
        ordered by the feature dendrogram, plus per-task summary panels
        (nr classes / avg samples, min stats, mean stat / nr calc feat).

        NOTE(review): re-orders the columns of ``self.ops_base_perf_vals``
        in place -- confirm no caller relies on the previous ordering.
        """
        fig = plt.figure(figsize = ((15,15)))
        # -- plot layout ------------------------------------------------------
        # -- axes rectangles in figure coordinates [left, bottom, width, height]
        rect_ustat_arr = [0.25,0.175,.5,.5]
        rect_dendr = [0.755,0.175,.145,.5]
        rect_measures0 = [0.25,0.68,0.5,0.1]
        rect_measures1 = [0.25,0.785,0.5,0.1]
        rect_measures2 = [0.25,0.89,0.5,0.1]
        
        ax_ustat_arr = fig.add_axes(rect_ustat_arr)
        ax_dendr = fig.add_axes(rect_dendr)
        ax_measures00 = fig.add_axes(rect_measures0)
        ax_measures01 = plt.twinx(ax_measures00) 
        ax_measures10 = fig.add_axes(rect_measures1)
        ax_measures10.set_xticklabels([])
        ax_measures20 = fig.add_axes(rect_measures2)
        
        ax_measures20.set_xticklabels([])
        ax_measures21 = plt.twinx(ax_measures20)
        
        # -- calculate and plot the dendrogram
        # -- first call (no_plot) only extracts the leaf order; second call draws
        dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left',no_plot=True)
        hierarchy.dendrogram(self.linkage, orientation='left',p=50,truncate_mode ='lastp',ax = ax_dendr)
        
        ax_dendr.set_yticks([])
        ax_dendr.axvline(self.max_dist_cluster,ls='--',c='k')

        # -- plot sorted classification stat array ------------------------------------------

        # -- create index that sort rows to correspond to dendrogram
        feat_sort_ind = dist_dendrogram['leaves']
        # -- sort the good performant features so they have the same order as the similarity array
        sort_ind = hlp.ismember(self.workflow.redundancy_method.similarity_array_op_ids,self.workflow.redundancy_method.good_perf_op_ids)
        self.ops_base_perf_vals = self.ops_base_perf_vals[:,sort_ind]
        
        # -- create index that sort columns with respect to their mean value
        task_sort_ind = np.argsort(self.ops_base_perf_vals[:,feat_sort_ind].mean(axis=1))
        
        #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
        #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
        all_classes_avg_top = self.ops_base_perf_vals
        # -- plot the operation names as y-axis tick labels
        aspect = all_classes_avg_top.shape[0] / float(all_classes_avg_top.shape[1])
        ax_ustat_arr.matshow(all_classes_avg_top[task_sort_ind,:][:,feat_sort_ind].T,aspect=aspect,origin='bottom')


        ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1],self.workflow.redundancy_method.similarity_array_op_ids)
        ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
        # -- plot the problem names as x axis labels
        ax_ustat_arr.xaxis.tick_bottom()
        ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
        ax_ustat_arr.set_xticklabels(self.task_names[task_sort_ind],rotation='vertical')

        # -- plot clusters ----------------------------------
        # -- white horizontal lines mark the boundaries between feature clusters
        cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
        for cluster_bound in cluster_bounds:
            ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)
    
        # --------------------------------------------------------------------------------
        # -- calculate and plot measures -------------------------------------------------
        # --------------------------------------------------------------------------------

        # -- nr samples and nr labels --------------------------------------------------
  
        n_samples_avg = [np.array(self.workflow.tasks[i].ts['n_samples']).mean() for i in task_sort_ind]
        n_classes = [len(set(self.workflow.tasks[i].labels)) for i in task_sort_ind]
        x_loc = np.arange(0,len(self.workflow.tasks))+0.5
        ax_measures00.scatter(x_loc,n_classes,c='b',s=40)
        ax_measures00.plot(x_loc,n_classes,c='b')
        [label.set_color('b') for label in ax_measures00.get_yticklabels()]
        ax_measures00.set_ylabel('nr classes')
        ax_measures00.yaxis.label.set_color('b')
        ax_measures00.set_ylim([0,max(n_classes)+1])
        ax_measures00.set_xticklabels([])
    
        ax_measures01.scatter(x_loc,n_samples_avg,c='r',s=40)
        ax_measures01.plot(x_loc,n_samples_avg,c='r')
        
        [label.set_color('r') for label in ax_measures01.get_yticklabels()]
        ax_measures01.set_ylabel('avg samples')
        ax_measures01.yaxis.label.set_color('r')
        ax_measures01.set_ylim([0,max(n_samples_avg)+100])
        
        ax_measures00.set_xlim([0,len(self.workflow.tasks)])
        ax_measures00.set_xticklabels([])
        

        # -- Classification stat measures --------------------------------------------------
        
        # -- minimum average classification stat for all features
        ax_measures10.plot(x_loc,np.min(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o',label='min. avg. classification stat all')

        # -- minimum average classification stat for top features
        ax_measures10.plot(x_loc,np.ma.min(self.ops_base_perf_vals[task_sort_ind,:],axis=1),marker='o',label='min. avg. classification stat top')
            
        # -- average minimum (for each class pair) classification stat for top features
        # XXX This would require task.pair_stats to be available (not saved as intermediate at the moment); then it is trivial to implement
        
        ax_measures10.legend(loc=2,fontsize='small',labelspacing=.1)
        ax_measures10.set_ylabel('classification stat')
        ax_measures10.set_xlim([0,len(self.workflow.tasks)])
        ax_measures10.set_ylim([0,0.5])

        # -- Classification stat measures and avg operations working--------------------------------------------------
        # -- mean average classification stat for all features
        ax_measures20.plot(x_loc,np.ma.mean(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o')
        [label.set_color('b') for label in ax_measures20.get_yticklabels()]
        ax_measures20.set_ylabel('avrg classification stat all feat')
        ax_measures20.yaxis.label.set_color('b')
        
        # -- number of successfully calculated features

        ax_measures21.plot(x_loc,[len(self.workflow.tasks[i].op['id']) for i in task_sort_ind],c='r',marker='o')
        [label.set_color('r') for label in ax_measures21.get_yticklabels()]
        ax_measures21.set_ylabel('nr calc feat')
        ax_measures21.yaxis.label.set_color('r')
        
        ax_measures20.set_xlim([0,len(self.workflow.tasks)])
Exemplo n.º 8
0
    def plot_stat_array(self):
        """Plot the (tasks x top features) U-statistic matrix, ordered by
        the feature dendrogram, plus per-task summary panels (nr classes /
        avg samples, min U-scores, mean U-score / nr calc feat).

        NOTE(review): re-orders the columns of ``self.ops_base_perf_vals``
        in place -- confirm no caller relies on the previous ordering.
        """
        fig = plt.figure(figsize = ((15,15)))
        # -- plot layout ------------------------------------------------------
        # -- axes rectangles in figure coordinates [left, bottom, width, height]
        rect_ustat_arr = [0.25,0.175,.5,.5]
        rect_dendr = [0.755,0.175,.145,.5]
        rect_measures0 = [0.25,0.68,0.5,0.1]
        rect_measures1 = [0.25,0.785,0.5,0.1]
        rect_measures2 = [0.25,0.89,0.5,0.1]
        
        ax_ustat_arr = fig.add_axes(rect_ustat_arr)
        ax_dendr = fig.add_axes(rect_dendr)
        ax_measures00 = fig.add_axes(rect_measures0)
        ax_measures01 = plt.twinx(ax_measures00) 
        ax_measures10 = fig.add_axes(rect_measures1)
        ax_measures10.set_xticklabels([])
        ax_measures20 = fig.add_axes(rect_measures2)
        
        ax_measures20.set_xticklabels([])
        ax_measures21 = plt.twinx(ax_measures20)
        
        # -- calculate and plot the dendrogram
        # -- first call (no_plot) only extracts the leaf order; second call draws
        dist_dendrogram = hierarchy.dendrogram(self.linkage, orientation='left',no_plot=True)
        hierarchy.dendrogram(self.linkage, orientation='left',p=50,truncate_mode ='lastp',ax = ax_dendr)
        
        ax_dendr.set_yticks([])
        ax_dendr.axvline(self.max_dist_cluster,ls='--',c='k')

        # -- plot sorted U-Stat array ------------------------------------------

        # -- create index that sort rows to correspond to dendrogram
        feat_sort_ind = dist_dendrogram['leaves']
        # -- sort the good performant features so they have the same order as the similarity array
        sort_ind = hlp.ismember(self.workflow.redundancy_method.similarity_array_op_ids,self.workflow.redundancy_method.good_perf_op_ids)
        self.ops_base_perf_vals = self.ops_base_perf_vals[:,sort_ind]
        
        # -- create index that sort columns with respect to their mean value
        task_sort_ind = np.argsort(self.ops_base_perf_vals[:,feat_sort_ind].mean(axis=1))
        
        #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
        #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
        all_classes_avg_top = self.ops_base_perf_vals
        # -- plot the operation names as y-axis tick labels
        aspect = all_classes_avg_top.shape[0] / float(all_classes_avg_top.shape[1])
        ax_ustat_arr.matshow(all_classes_avg_top[task_sort_ind,:][:,feat_sort_ind].T,aspect=aspect,origin='bottom')


        ax_ustat_arr.set_yticks(range(len(feat_sort_ind)))

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(op_id_name_map[0],op_id_name_map[1],self.workflow.redundancy_method.similarity_array_op_ids)
        ax_ustat_arr.set_yticklabels(np.array(names)[feat_sort_ind])
        # -- plot the problem names as x axis labels
        ax_ustat_arr.xaxis.tick_bottom()
        ax_ustat_arr.set_xticks(range(all_classes_avg_top.shape[0]))
        ax_ustat_arr.set_xticklabels(self.task_names[task_sort_ind],rotation='vertical')

        # -- plot clusters ----------------------------------
        # -- white horizontal lines mark the boundaries between feature clusters
        cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
        for cluster_bound in cluster_bounds:
            ax_ustat_arr.axhline(cluster_bound,c='w',lw=2)
    
        # --------------------------------------------------------------------------------
        # -- calculate and plot measures -------------------------------------------------
        # --------------------------------------------------------------------------------

        # -- nr samples and nr labels --------------------------------------------------
  
        n_samples_avg = [np.array(self.workflow.tasks[i].ts['n_samples']).mean() for i in task_sort_ind]
        n_classes = [len(set(self.workflow.tasks[i].labels)) for i in task_sort_ind]
        x_loc = np.arange(0,len(self.workflow.tasks))+0.5
        ax_measures00.scatter(x_loc,n_classes,c='b',s=40)
        ax_measures00.plot(x_loc,n_classes,c='b')
        [label.set_color('b') for label in ax_measures00.get_yticklabels()]
        ax_measures00.set_ylabel('nr classes')
        ax_measures00.yaxis.label.set_color('b')
        ax_measures00.set_ylim([0,max(n_classes)+1])
        ax_measures00.set_xticklabels([])
    
        ax_measures01.scatter(x_loc,n_samples_avg,c='r',s=40)
        ax_measures01.plot(x_loc,n_samples_avg,c='r')
        
        [label.set_color('r') for label in ax_measures01.get_yticklabels()]
        ax_measures01.set_ylabel('avg samples')
        ax_measures01.yaxis.label.set_color('r')
        ax_measures01.set_ylim([0,max(n_samples_avg)+100])
        
        ax_measures00.set_xlim([0,len(self.workflow.tasks)])
        ax_measures00.set_xticklabels([])
        

        # -- U-stat measures --------------------------------------------------
        
        # -- minimum average U-score for all features
        ax_measures10.plot(x_loc,np.min(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o',label='min. avg. U-score all')

        # -- minimum average U-score for top features
        ax_measures10.plot(x_loc,np.ma.min(self.ops_base_perf_vals[task_sort_ind,:],axis=1),marker='o',label='min. avg. U-score top')
            
        # -- average minimum (for each class pair) U-score for top features
        # XXX This would require task.pair_stats to be available (not saved as intermediate at the moment); then it is trivial to implement
        
        ax_measures10.legend(loc=2,fontsize='small',labelspacing=.1)
        ax_measures10.set_ylabel('u-score')
        ax_measures10.set_xlim([0,len(self.workflow.tasks)])
        ax_measures10.set_ylim([0,0.5])

        # -- U-stat measures and avg operations working--------------------------------------------------
        # -- mean average U-score for all features
        ax_measures20.plot(x_loc,np.ma.mean(self.workflow.stats_good_op[task_sort_ind,:],axis=1),marker='o')
        [label.set_color('b') for label in ax_measures20.get_yticklabels()]
        ax_measures20.set_ylabel('avrg u-scrore all feat')
        ax_measures20.yaxis.label.set_color('b')
        
        # -- number of successfully calculated features

        ax_measures21.plot(x_loc,[len(self.workflow.tasks[i].op['id']) for i in task_sort_ind],c='r',marker='o')
        [label.set_color('r') for label in ax_measures21.get_yticklabels()]
        ax_measures21.set_ylabel('nr calc feat')
        ax_measures21.yaxis.label.set_color('r')
        
        ax_measures20.set_xlim([0,len(self.workflow.tasks)])
Exemplo n.º 9
0
def cat_data_op_subset(file_paths, op_id_top, is_from_old_matlab=False, is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.m files pointed to by file_paths.
    Warning, this can take a while and the returned data matrix can be very large.
    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come from the same
    database. Meaning op_ids are the same. Otherwise one would have to go through operation names which is
    only a little more work to implement. XXX
    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list,ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        If the HCTSA_loc.mat files are saved from an older version of the comp engine. The order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while pickling. If this is False the function
        returns a normal ndarray with unknown entries set to NaN. This can be converted to a masked array with
        data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array
    """
    is_first = True
    data_all = None

    for file_path in file_paths:
        # -- parenthesized form works under both Python 2 and 3
        print("Adding data from {:s} \n to complete data matrix".format(file_path))
        data, op = mIO.read_from_mat_file(file_path, ['TS_DataMat', 'Operations'],
                                          is_from_old_matlab=is_from_old_matlab)

        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'], op_id_top, is_return_masked_array=True, return_dtype=int)
        # -- if any of the operations was not calculated for this problem
        # -- create a masked array and copy only valid data and mask
        # -- invalid data.
        # -- BUGFIX: `ind.data != op_id_top` is an element-wise ndarray
        # -- comparison whose truth value is ambiguous (raises ValueError
        # -- for more than one element); np.array_equal gives the intended
        # -- whole-sequence comparison (and is False on length mismatch).
        if not np.array_equal(ind.data, op_id_top):
            # -- create a masked array filled with NaN.
            # -- This makes later masking of non-existent entries easier
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])

            data_ma = np.empty((data.shape[0], np.array(op_id_top).shape[0]))
            data_ma[:] = np.NaN
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top. We therefore do not need this operation to
                # -- be included in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])

        # -- stick the blocks of all files together, row-wise
        if is_first:
            data_all = data_ma
            is_first = False
        else:
            data_all = np.vstack((data_all, data_ma))
    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
# -- Script fragment: per-problem summary panel and average-minimum U-score.
# -- NOTE(review): relies on names defined earlier in the full script
# -- (ax_measures21, ax_measures20, ax_measures10, x_loc,
# -- all_classes_avg_masked_sort, problem_paths, problem_names,
# -- intermediate_data_root, avg_min_u_score_path) -- not visible here.
# -- number of successfully calculated features
ax_measures21.plot(x_loc[:-1], (~all_classes_avg_masked_sort.mask).sum(axis=1)[:-1], c="r", marker="o")
[label.set_color("r") for label in ax_measures21.get_yticklabels()]
ax_measures21.set_ylabel("nr calc feat")
ax_measures21.yaxis.label.set_color("r")

ax_measures20.set_xlim([0, problem_paths.shape[0]])

# -- Calculate the average min (for each label pair separately) score for every problem
# -- NOTE(review): `if False:` permanently disables the recomputation branch;
# -- the cached value at avg_min_u_score_path is always loaded instead.
if False:
    avg_min_u_score = np.ones(problem_paths.shape[0]) * np.NaN
    ustat_paths = np.array(glob.glob(intermediate_data_root + "/*_ustat.npy"))
    reg_ex = re.compile("../data/(.*)_ustat.npy")
    ustat_names = np.array([reg_ex.match(ustat_path).group(1) for ustat_path in ustat_paths])
    # -- sort ustat paths to match the problem_paths
    ustat_sort_ind = hlp.ismember(problem_names, ustat_names)
    ustat_paths = ustat_paths[ustat_sort_ind]
    ustat_names = ustat_names[ustat_sort_ind]

    for i, (ustat_path, mat_file_path) in enumerate(zip(ustat_paths, problem_paths)):
        ustat = np.load(ustat_path)
        # -- calculate the scaling factor for every label pairing of the current classification problem
        # -- NOTE(review): is_from_old_matlab="True" passes a (truthy) string,
        # -- not the boolean True -- presumably unintended; verify callee.
        u_scale = testst.u_stat_norm_factor(mat_file_path, is_from_old_matlab="True")
        print ustat_path
        avg_min_u_score[i] = (np.min(ustat, axis=1) / u_scale).mean()
    np.save(avg_min_u_score_path, avg_min_u_score)
else:
    avg_min_u_score = np.load(avg_min_u_score_path)
# -- average minimum (for each class pair) U-score for top features
# -- NOTE(review): `porblem_sort_ind` looks like a typo for
# -- `problem_sort_ind` -- confirm against the defining script.
ax_measures10.plot(x_loc, avg_min_u_score[porblem_sort_ind], marker="o", label="avg. min. U-score all")
ax_measures10.legend(loc=2, fontsize="small", labelspacing=0.1)
Exemplo n.º 11
0
    def plot_stat_array(self):

        fig = plt.figure(figsize=((15, 15)))
        # -- plot layout ------------------------------------------------------

        #rect_ustat_arr = [0.01,0.01,0.75,0.75] #[0.25,0.175,.5,.5]
        #rect_dendr = [0.76,0.01,.2175,.75] #[0.755,0.175,.145,.5]
        rect_ustat_arr = fig.add_axes([0.15, 0.2, 0.7, 0.8])
        #rect_dendr = fig.add_axes([0.7, 0.1, 0.145, 0.8])
        rect_dendr = fig.add_axes([0.15, 0.8, 0.873, 0.2])
        '''rect_measures0 = [0.25,0.68,0.5,0.1]
        rect_measures1 = [0.25,0.785,0.5,0.1]
        rect_measures2 = [0.25,0.89,0.5,0.1]'''

        ax_ustat_arr = fig.add_axes(rect_ustat_arr)
        ax_dendr = fig.add_axes(rect_dendr)
        '''ax_measures00 = fig.add_axes(rect_measures0)
        ax_measures01 = plt.twinx(ax_measures00) 
        ax_measures10 = fig.add_axes(rect_measures1)
        ax_measures10.set_xticklabels([])
        ax_measures20 = fig.add_axes(rect_measures2)
        
        ax_measures20.set_xticklabels([])
        ax_measures21 = plt.twinx(ax_measures20)'''

        # -- calculate and plot the dendrogram
        dist_dendrogram = hierarchy.dendrogram(self.linkage,
                                               orientation='top',
                                               no_plot=True)
        hierarchy.dendrogram(self.linkage,
                             orientation='top',
                             p=50,
                             truncate_mode='lastp',
                             ax=ax_dendr)

        ax_dendr.set_xticks([])
        ax_dendr.axvline(self.max_dist_cluster, ls='--', c='k')

        # -- plot sorted classification stat array ------------------------------------------

        # -- create index that sort rows to correspond to dendrogram
        feat_sort_ind = dist_dendrogram['leaves']
        # -- sort the good performant features so they have the same order as the similarity array
        sort_ind = hlp.ismember(
            self.workflow.redundancy_method.similarity_array_op_ids,
            self.workflow.redundancy_method.good_perf_op_ids)
        self.ops_base_perf_vals = self.ops_base_perf_vals[:, sort_ind]
        # -- create index that sort columns with respect to their mean value
        task_sort_ind = np.argsort(
            self.ops_base_perf_vals[:, feat_sort_ind].mean(axis=1))

        #all_classes_avg_top = ((all_classes_avg_top - np.ma.mean(all_classes_avg_top,axis=0)) / np.ma.std(all_classes_avg_top,axis=0))
        #all_classes_avg_top = fap.normalise_masked_array(self.ops_base_perf_vals, axis= 1,norm_type = 'zscore')[0]
        all_classes_avg_top = self.ops_base_perf_vals
        # -- plot the operation names as y-axis tick labels
        aspect = all_classes_avg_top.shape[0] / float(
            all_classes_avg_top.shape[1])
        im = ax_ustat_arr.matshow(
            all_classes_avg_top[task_sort_ind, :][:, feat_sort_ind],
            aspect=aspect,
            origin='bottom',
            cmap='turbo')

        ax_ustat_arr.set_yticks(range(all_classes_avg_top.shape[0]))

        op_id_name_map = self.map_op_id_name_mult_task(self.workflow.tasks)
        names = hlp.ind_map_subset(
            op_id_name_map[0], op_id_name_map[1],
            self.workflow.redundancy_method.similarity_array_op_ids)
        ax_ustat_arr.set_yticklabels(self.task_names[task_sort_ind])
        # -- plot the problem names as x axis labels
        ax_ustat_arr.xaxis.tick_bottom()
        ax_ustat_arr.set_xticks(range(len(feat_sort_ind)))
        ax_ustat_arr.set_xticklabels(np.array(names)[feat_sort_ind],
                                     rotation='vertical')

        fig.colorbar(im)

        # -- plot clusters ----------------------------------
        '''cluster_bounds = np.nonzero(np.diff(self.workflow.redundancy_method.cluster_inds[feat_sort_ind]))[0]+0.5
Exemplo n.º 12
0
                   marker='o')
# -- Script fragment: per-problem summary panel and average-minimum U-score.
# -- NOTE(review): relies on names defined earlier in the full script
# -- (ax_measures21, ax_measures20, x_loc, problem_paths, problem_names,
# -- intermediate_data_root, avg_min_u_score_path) -- not visible here.
[label.set_color('r') for label in ax_measures21.get_yticklabels()]
ax_measures21.set_ylabel('nr calc feat')
ax_measures21.yaxis.label.set_color('r')

ax_measures20.set_xlim([0, problem_paths.shape[0]])

# -- Calculate the average min (for each label pair separately) score for every problem
# -- NOTE(review): `if False:` permanently disables the recomputation branch;
# -- the cached value at avg_min_u_score_path is always loaded instead.
if False:
    avg_min_u_score = np.ones(problem_paths.shape[0]) * np.NaN
    ustat_paths = np.array(glob.glob(intermediate_data_root + '/*_ustat.npy'))
    reg_ex = re.compile('../data/(.*)_ustat.npy')
    ustat_names = np.array(
        [reg_ex.match(ustat_path).group(1) for ustat_path in ustat_paths])
    # -- sort ustat paths to match the problem_paths
    ustat_sort_ind = hlp.ismember(problem_names, ustat_names)
    ustat_paths = ustat_paths[ustat_sort_ind]
    ustat_names = ustat_names[ustat_sort_ind]

    for i, (ustat_path,
            mat_file_path) in enumerate(zip(ustat_paths, problem_paths)):
        ustat = np.load(ustat_path)
        # -- calculate the scaling factor for every label pairing of the current classification problem
        # -- NOTE(review): is_from_old_matlab='True' passes a (truthy) string,
        # -- not the boolean True -- presumably unintended; verify callee.
        u_scale = testst.u_stat_norm_factor(mat_file_path,
                                            is_from_old_matlab='True')
        print ustat_path
        avg_min_u_score[i] = (np.min(ustat, axis=1) / u_scale).mean()
    np.save(avg_min_u_score_path, avg_min_u_score)
else:
    avg_min_u_score = np.load(avg_min_u_score_path)
# -- average minimum (for each class pair) U-score for top features