Example #1
    def input_task(self, task_name, is_read_feature_data=True):
        """
        Read the required data from an HCTSA_loc.mat file
        Parameters:
        -----------
        task_name : string
            the name of the classification task to be imported
        is_read_feature_data : bool
            if true, the feature data matrix will be read (default is True)
        Returns:
        --------
        data : ndarray
            Array containing the data. Each row corresponds to a timeseries and each column to an operation.
        ts : dict
            dictionary containing the information for all timeseries (rows in data).
            Keys are ['keywords', 'n_samples', 'id', 'filename']
        op : dict
            dictionary containing the information for all contained operations.
            Keys are ['keywords', 'master_id', 'id', 'code_string', 'name']

        """
        # -- assemble the file path
        mat_file_path = self.path_pattern.format(task_name)
        # -- load the data, operations and timeseries from the matlab file
        if is_read_feature_data:
            data, op, ts = mIO.read_from_mat_file(
                mat_file_path, ['TS_DataMat', 'Operations', 'TimeSeries'],
                is_from_old_matlab=True)
            return self.masking_method(data), ts, op
        else:
            op, ts = mIO.read_from_mat_file(
                mat_file_path, ['Operations', 'TimeSeries'],
                is_from_old_matlab=True)
            return None, ts, op
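A minimal usage sketch of the method above, assuming a hypothetical `loader` instance whose class defines `path_pattern`, `masking_method` and `input_task`; the task name 'Beef' is also hypothetical:

# -- `loader` stands in for an instance of the class defining input_task
loader.path_pattern = '/data/HCTSA_{:s}.mat'    # hypothetical path pattern
loader.masking_method = np.ma.masked_invalid    # e.g. mask NaN entries
data, ts, op = loader.input_task('Beef')
# data has one row per timeseries (ts['id']) and one column per operation (op['id'])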
Example #2
def count_op_calc(mat_file_paths, is_from_old_matlab=False):
    """
    Counts how many times each operation has been calculated successfully across the problems,
    each represented by an HCTSA_loc.mat file in mat_file_paths
    Parameters:
    ----------
    mat_file_paths : list
        Paths to the HCTSA_loc.mat files corresponding to the problems considered.   
    is_from_old_matlab : bool
        Whether the HCTSA_loc.mat files were saved by an older version of the comp engine, in which case the order of entries is different.
    Returns:
    --------
    count_op_calc_all_problems : ndarray
        Array where each entry represents one operation and each value is the number of successful
        calculations of the corresponding operation for the given problems.
    """

    count_op_calc_all_problems = np.zeros(10000)
    for mat_file_path in mat_file_paths:

        op, = mIO.read_from_mat_file(mat_file_path, ['Operations'],
                                     is_from_old_matlab=is_from_old_matlab)
        print "Counting which operations calculated in: {:s}".format(
            mat_file_path)

        count_op_calc_all_problems[op['id']] += 1
    return count_op_calc_all_problems
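A short hypothetical call, assuming two HCTSA_loc.mat files on disk:

mat_file_paths = ['/data/task_a/HCTSA_loc.mat',    # hypothetical paths
                  '/data/task_b/HCTSA_loc.mat']
counts = count_op_calc(mat_file_paths, is_from_old_matlab=True)
# counts[i] is the number of problems for which the operation with op_id i succeeded
always_calculated = np.nonzero(counts == len(mat_file_paths))[0]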
Example #3
def best_noncorr_op_ind(ind_dict, mask, file_path, op=None, is_from_old_matlab=False):
    """
    Compute the indices of the top features for a specific HCTSA_loc.mat file
    and the corresponding operation ids
    Parameters:
    -----------
    ind_dict : dict
        Dictionary where keys are file paths and values are the indices in the data matrix
        of HCTSA_loc.mat
    mask : array-like
        Mask to reduce the indices given in ind_dict
    file_path : string
        Path to the HCTSA_loc.mat file
    op : dict, optional
        Operations dictionary from the HCTSA_loc.mat file at file_path
    is_from_old_matlab : bool
        Whether the HCTSA_loc.mat files were saved by an older version of the comp engine,
        in which case the order of entries is different.

    Returns:
    --------
    ind_top : array
        Indices of the top features, combining the information in ind_dict and mask for the
        HCTSA_loc.mat file pointed to by file_path
    op_id_top : array
        Operation ids corresponding to ind_top
    """
    ind = np.array(ind_dict[file_path])
    if op is None:
        op, = mIO.read_from_mat_file(file_path, ['Operations'],
                                     is_from_old_matlab=is_from_old_matlab)
    op_id_top = np.array(op['id'])[ind][mask]
    ind_top = ind[mask]
    return ind_top, op_id_top
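A hypothetical call, where ind_dict holds the 100 best feature indices per file and the mask keeps the first 50 of them:

file_path = '/data/task_a/HCTSA_loc.mat'            # hypothetical path
ind_dict = {file_path: np.arange(100)}              # hypothetical precomputed indices
mask = np.arange(100) < 50
ind_top, op_id_top = best_noncorr_op_ind(ind_dict, mask, file_path,
                                         is_from_old_matlab=True)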
Example #4
    def input_task_master(self,
                          task_name,
                          is_read_feature_data=True,
                          old_matlab=False):
        """
        Read the required data and master operations from an HCTSA_loc.mat file
        Parameters:
        -----------
        task_name : string
            the name of the classification task to be imported
        is_read_feature_data : bool
            if true, the feature data matrix will be read (default is True)
        old_matlab : bool
            if true, the HCTSA_loc.mat file is assumed to be saved by an older version of the comp engine
        Returns:
        --------
        data : ndarray
            Array containing the data. Each row corresponds to a timeseries and each column to an operation.
        ts : dict
            dictionary containing the information for all timeseries (rows in data).
            ['keywords', 'n_samples', 'id', 'filename']
        op : dict
            dictionary containing the information for all contained operations.
            Keys are ['keywords', 'master_id', 'id', 'code_string', 'name']
        m_op : dict
            dictionary containing the information for all master operations

        """
        # -- assemble the file path
        mat_file_path = self.path_pattern.format(task_name)
        print "Reading file {}".format(mat_file_path)

        # -- load the data,operations and timeseries from the matlab file
        if is_read_feature_data:
            data, op, ts, m_op = mIO.read_from_mat_file(
                mat_file_path,
                ['TS_DataMat', 'Operations', 'TimeSeries', 'MasterOperations'],
                is_from_old_matlab=old_matlab)
        else:
            op, ts, m_op = mIO.read_from_mat_file(
                mat_file_path,
                ['Operations', 'TimeSeries', 'MasterOperations'],
                is_from_old_matlab=old_matlab)
            data = None

        if is_read_feature_data:
            return self.masking_method(data), ts, op, m_op
        else:
            return None, ts, op, m_op
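The same sketch as for input_task applies here, again with a hypothetical loader instance and task name; the extra return value holds the master operations:

data, ts, op, m_op = loader.input_task_master('Beef', old_matlab=True)
# op['master_id'] links each operation to its entry in the master operations m_op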
Example #5
def calculate_ustat_avg_mult_task(mat_file_paths,
                                  u_stat_file_paths,
                                  all_classes_avg_out_path='./',
                                  is_from_old_matlab=False):
    """
    For multiple tasks calculate the u statistics for each task averaged over all possible label pairs.
    The results are saved to disk.
    Parameters:
    -----------
    mat_file_paths : list
        List of file paths to the MAT files containing the HCTSA data.
    u_stat_file_paths : list
        File paths of the saved u statistics data in binary npy files.
    all_classes_avg_out_path : string
        Path of the output file (.npy) in which the tasks' average u statistics are saved.
    is_from_old_matlab : boolean
        Whether the MAT files were saved by an older version of the comp engine
    Returns:
    --------
    all_classes_avg : ndarray
        ndarray where each row represents a task and column i represents operation with op_id = i.

    """

    # -- initialise the array containing the average u-statistic values for all problems and features
    all_classes_avg = np.ones((len(u_stat_file_paths), 10000)) * np.NAN

    for i, (u_stat_file_path,
            mat_file_path) in enumerate(zip(u_stat_file_paths,
                                            mat_file_paths)):

        # -- load the u statistic for every operation and label pairing
        u_stat = np.load(u_stat_file_path)

        # -- calculate the scaling factor for every label pairing of the current classification problem
        u_scale = u_stat_norm_factor(mat_file_path,
                                     is_from_old_matlab=is_from_old_matlab)

        # -- calculate the average scaled u statistic over all label pairs in current problem
        u_stat_avg = (u_stat.T / u_scale).transpose().mean(axis=0)

        # -- save the average scaled u-statistic for all features to the all_classes_avg array.
        #    The column number corresponds with the operation id
        op, = mIO.read_from_mat_file(mat_file_path, ['Operations'],
                                     is_from_old_matlab=is_from_old_matlab)
        all_classes_avg[i, op['id']] = u_stat_avg

    np.save(all_classes_avg_out_path, all_classes_avg)
    return all_classes_avg
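A hypothetical end-to-end call, assuming the per-task u statistics were computed and saved as .npy files beforehand:

mat_file_paths = ['/data/task_a/HCTSA_loc.mat', '/data/task_b/HCTSA_loc.mat']
u_stat_file_paths = ['/data/task_a/u_stat.npy', '/data/task_b/u_stat.npy']
all_classes_avg = calculate_ustat_avg_mult_task(
    mat_file_paths, u_stat_file_paths,
    all_classes_avg_out_path='/data/all_classes_avg.npy',
    is_from_old_matlab=True)
# row i holds the averaged, scaled u statistic of every operation for task i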
Example #6
def u_stat_norm_factor(file_name, is_from_old_matlab=False):
    """
    Return the u statistic scaling factor n_1*n_2 where n_i is the
    number of time series with label i. Every entry corresponds to
    one label pairing in the classification problem pointed to
    by file_name.
    Parameters:
    -----------
    file_name : string
        Filename of HCTSA_loc.mat file containing data of the current problem
    is_from_old_matlab : bool
        Whether the HCTSA_loc.mat files were saved by an older version of the comp engine, in which case the order of entries is different.

    Returns:
    --------
    u_scale : array
        Scaling factor for u statistic for every label pairing in the current
        classification problem
    """
    ts, = mIO.read_from_mat_file(file_name, ['TimeSeries'],
                                 is_from_old_matlab=is_from_old_matlab)
    labels = [int(x.split(',')[-1]) for x in ts['keywords']]

    labels_unique = list(set(labels))
    labels = np.array(labels)
    n_labels = len(labels_unique)
    nr_items_label = []

    # -- for every label calculate the number of time series with this label
    for i, label in enumerate(labels_unique):
        nr_items_label.append(np.nonzero((labels == label))[0].shape[0])

    # -- initialise the u_scale array for all label pairings
    u_scale = np.zeros(n_labels * (n_labels - 1) // 2)

    # -- calculate the scaling factor for the u statistic for every label pairing
    for i, (label_ind_0, label_ind_1) in enumerate(
            itertools.combinations(range(n_labels), 2)):
        u_scale[i] = nr_items_label[label_ind_0] * nr_items_label[label_ind_1]

    return u_scale
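To make the scaling concrete: for three labels with 10, 20 and 30 time series, the pairings (0,1), (0,2) and (1,2) give factors 200, 300 and 600, and dividing a pairing's U statistic by n_1*n_2 maps it into [0, 1]. A self-contained sketch of the same combinatorics, with hypothetical class sizes:

import itertools
import numpy as np

nr_items_label = [10, 20, 30]                       # hypothetical class sizes
n_labels = len(nr_items_label)
u_scale = np.zeros(n_labels * (n_labels - 1) // 2)
for i, (a, b) in enumerate(itertools.combinations(range(n_labels), 2)):
    u_scale[i] = nr_items_label[a] * nr_items_label[b]
# u_scale is now [200., 300., 600.]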
Example #7
def cat_data_op_subset(file_paths,
                       op_id_top,
                       is_from_old_matlab=False,
                       is_return_masked=True):
    """
    Concatenate the features where op_id is in op_id_top for all HCTSA_loc.mat files pointed to by file_paths.
    Warning: this can take a while and the returned data matrix can be very large.
    XXX WARNING XXX This only works correctly if all HCTSA_loc.mat files come from the same
    database, meaning the op_ids are the same. Otherwise one would have to go through operation
    names, which is only a little more work to implement. XXX
    Parameters:
    -----------
    file_paths : list
        list of file paths pointing to the files containing the data
    op_id_top : list or ndarray
        list of operation ids wanted in the concatenated data array
    is_from_old_matlab : bool
        Whether the HCTSA_loc.mat files were saved by an older version of the comp engine,
        in which case the order of entries is different.
    is_return_masked : boolean
        Saving large masked arrays to disk can lead to memory errors while pickling. If this is
        False, the function returns a normal ndarray with unknown entries set to NaN. This can be
        converted to a masked array with data_all = np.ma.masked_invalid(data_all)
    Returns:
    --------
    data_all : ndarray/masked ndarray
        Concatenated data array
    """
    is_first = True
    data_all = None

    for file_path in file_paths:
        print "Adding data from {:s} \n to complete data matrix".format(
            file_path)
        data, op = mIO.read_from_mat_file(
            file_path, ['TS_DataMat', 'Operations'],
            is_from_old_matlab=is_from_old_matlab)

        # -- find the indices in the data for op_id_top
        ind = hlp.ismember(op['id'],
                           op_id_top,
                           is_return_masked_array=True,
                           return_dtype=int)
        # -- if any of the operations was not calculated for this problem
        # -- create a masked array and copy only valid data and mask
        # -- invalid data
        if not np.array_equal(ind.data, op_id_top):
            # -- create a masked array filled with NaN.
            # -- This makes later masking of non-existent entries easier
            # -- each column of data_ma corresponds to the op_id in op_id_top with the
            # -- same index (column i in data_ma corresponds to op_id_top[i])

            data_ma = np.empty((data.shape[0], np.array(op_id_top).shape[0]))
            data_ma[:] = np.NaN
            for it, i in enumerate(ind):
                # -- if i is masked in ind that means that the current operation in data
                # -- is not part of op_id_top. We therefore do not need this operation to
                # -- be included in data_ma.
                if i is not np.ma.masked:
                    data_ma[:, i] = data[:, it]
        # -- otherwise pick all relevant features and also automatically sort them correctly (if necessary)
        else:
            data_ma = np.array(data[:, ind])

        # -- mask all NaN (not calculated) entries and stick them together
        #data_ma = np.ma.masked_invalid(data_ma)
        if is_first:
            data_all = data_ma
            is_first = False
        else:
            data_all = np.vstack((data_all, data_ma))
    # -- Saving a large masked array to disk can lead to Memory errors while using the pickle module.
    if is_return_masked:
        data_all = np.ma.masked_invalid(data_all)
    return data_all
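A hypothetical call, stacking three chosen operations across two problems into one matrix with a column per requested op_id:

file_paths = ['/data/task_a/HCTSA_loc.mat',         # hypothetical paths
              '/data/task_b/HCTSA_loc.mat']
data_all = cat_data_op_subset(file_paths, op_id_top=[11, 42, 107],
                              is_from_old_matlab=True)
# data_all: (total number of timeseries across both files, 3), masked where missing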
Example #8
all_classes_avg_out_path = intermediate_data_root + '/all_classes_avg.npy'
op_id_good_path = intermediate_data_root + '/op_id_good.npy'
op_id_order_path = intermediate_data_root + '/op_id_order.npy'
sort_good_ind_path = intermediate_data_root + '/sort_good_ind.npy'

# -- Load the data
all_classes_avg = np.load(all_classes_avg_out_path)
op_id_good = np.load(op_id_good_path)

# -- Mask NaN entries
all_classes_avg_good = np.ma.masked_invalid(all_classes_avg[:, op_id_good])

# -- load a reference HCTSA_loc.mat containing all op_ids
import modules.misc.PK_matlab_IO as mIO
op_ref_HCTSA_path = '/home/philip/work/OperationImportanceProject/results/done/HCTSA_Beef.mat'
op, = mIO.read_from_mat_file(op_ref_HCTSA_path, ['Operations'],
                             is_from_old_matlab=True)

max_feat = 50
# -- calculate the correlation
abs_corr_array, sort_good_ind, all_classes_avg_good_norm = idtop.calc_perform_corr_mat(
    all_classes_avg_good, norm='z-score', max_feat=max_feat)

# -- save the op id's in order of performance (first entry = best performance)
np.save(op_id_order_path, op_id_good[sort_good_ind])
# -- sort the permutation vector that would sort the data array containing the good operations only
np.save(sort_good_ind_path, sort_good_ind)

# -- extract the top feature names
names = hlp.ind_map_subset(op['id'], op['name'],
                           op_id_good[sort_good_ind][:max_feat])
# all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_order[:100]])
# # -- calculate the z-score of the u stat array
# all_classes_avg_top = ((all_classes_avg_top.T - np.ma.mean(all_classes_avg_top,axis=1)) / np.ma.std(all_classes_avg_top,axis=1)).T
# abs_corr_array = np.abs(np.ma.corrcoef(all_classes_avg_top, rowvar=0)) 

# -- calculate the correlation array with respect to performance and mask nan.
abs_corr_array,sort_good_ind,all_classes_avg_good_norm = idtop.calc_perform_corr_mat(all_classes_avg[:,op_id_good],norm='z-score', max_feat = max_feat)
all_classes_avg_top = np.ma.masked_invalid(all_classes_avg[:,op_id_good][:,sort_good_ind[:max_feat]])

# -- calculate the linkage for the correlation
corr_linkage = idtop.calc_linkage(abs_corr_array)[0]

# -- extract operation names --- ------------------------------------------

top_id = op_id_good[sort_good_ind][:max_feat]
names = hlp.ind_map_subset(op['id'], op['name'], op_id_good[sort_good_ind][:max_feat])

# -- extract problem names --- ------------------------------------------
reg_ex = re.compile(r'.*/HCTSA_(.*)_N_70_100_reduced.mat')
problem_paths = np.load(problem_names_path)

problem_names = np.array([reg_ex.match(problem_path).group(1) for problem_path in problem_paths])


# ---------------------------------------------------------------------
# -- Plot -------------------------------------------------------------
# ---------------------------------------------------------------------
Example #9
def u_stat_all_label_file_name(file_name, mask=None, is_from_old_matlab=False):
    """
    Calculate the U-statistic for all label pairings in a HCTSA_loc.mat file. The operations can be masked
    by a boolean array.
    Parameters:
    -----------
    file_name : string
        File name of the HCTSA_loc.mat file containing at least the matrices 'TimeSeries' and 'TS_DataMat'
    mask : ndarray dtype='bool', optional
        If given, this acts as a mask selecting which features are included in the calculation
    is_from_old_matlab : bool
        Whether the HCTSA_loc.mat files were saved by an older version of the comp engine, in which case the order of entries is different.

    Returns:
    --------
    ranks : ndarray of dim (n_labels * (n_labels-1) / 2, nr_features)
        The U-statistic for all features and label pairs, where a row represents a pair of labels.
        Every column represents one feature
    labels_unique : list
        List of all unique labels
    label_ind_list : list
        List of lists where each sub-list i represents all row-indices in data containing timeseries
        for labels_unique[i]
    """
    # ---------------------------------------------------------------------
    # load the data
    # ---------------------------------------------------------------------
    ts, data = mIO.read_from_mat_file(file_name, ['TimeSeries', 'TS_DataMat'],
                                      is_from_old_matlab=is_from_old_matlab)

    # ---------------------------------------------------------------------
    # mask the data if required
    # ---------------------------------------------------------------------
    if mask is not None:
        data = data[:, mask]

    # ---------------------------------------------------------------------
    # extract the unique labels
    # ---------------------------------------------------------------------
    labels = [int(x.split(',')[-1]) for x in ts['keywords']]
    labels_unique = list(set(labels))

    labels = np.array(labels)
    label_ind_list = []

    # ---------------------------------------------------------------------
    # get indices for all unique labels
    # ---------------------------------------------------------------------
    for i, label in enumerate(labels_unique):
        label_ind_list.append(np.nonzero((labels == label))[0])
    n_labels = len(label_ind_list)

    # ---------------------------------------------------------------------
    # calculate Mann-Whitney u-test
    # ---------------------------------------------------------------------
    ranks = np.zeros((n_labels * (n_labels - 1) // 2, data.shape[1]))

    for i, (label_ind_0, label_ind_1) in enumerate(
            itertools.combinations(range(n_labels), 2)):
        # -- select the data for the current labels
        data_0 = data[label_ind_list[label_ind_0], :]
        data_1 = data[label_ind_list[label_ind_1], :]
        print i + 1, '/', n_labels * (n_labels - 1) // 2
        for k in range(0, data.shape[1]):
            # -- in the case of same value for every feature in both arrays set max possible value
            if np.ma.all((data_0[:, k] == data_0[0, k])) and np.ma.all(
                (data_1[:, k] == data_0[0, k])):
                ranks[i, k] = data_0[:, k].shape[0] * data_1[:, k].shape[0] / 2.
            else:
                ranks[i, k] = stats.mannwhitneyu(data_0[:, k], data_1[:, k])[0]

    return ranks, labels_unique, label_ind_list
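A hypothetical call; combined with u_stat_norm_factor from above, each row can be normalised into [0, 1]:

file_name = '/data/task_a/HCTSA_loc.mat'            # hypothetical path
ranks, labels_unique, label_ind_list = u_stat_all_label_file_name(
    file_name, is_from_old_matlab=True)
u_scale = u_stat_norm_factor(file_name, is_from_old_matlab=True)
u_norm = (ranks.T / u_scale).T    # one row per label pairing, one column per feature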