def test_init_var_indicators(self):
    """
    Check the values returned by utils.init_var_indicators for one forward
    call and one reverse call on small 5 x 5 sample matrices.
    """
    def constant_columns():
        # every row is 0..4, so column j holds the value j for all 5 samples
        return np.array([[col for col in range(5)] for _ in range(5)])

    # forward direction, variables 1 and 2
    outputs = utils.init_var_indicators(
        1, 2, constant_columns(), constant_columns(), True)
    exceeds, reverse, extrema_p, extrema_r, var1, var2 = outputs
    assert np.all(exceeds == 0)
    assert np.all(reverse == 0)
    assert np.all(extrema_p == 0)
    assert np.all(extrema_r == 1, axis=0)
    assert np.all(var1 == 1)
    assert np.all(var2 == 2)

    # reverse direction, variables 4 and 3
    outputs = utils.init_var_indicators(
        4, 3, constant_columns(), constant_columns(), False)
    exceeds, reverse, extrema_p, extrema_r, var1, var2 = outputs
    assert np.all(exceeds == 0)
    assert np.all(reverse == 0)
    assert np.all(extrema_p == 1)
    assert np.all(extrema_r == 0)
    assert np.all(var1 == 4)
    assert np.all(var2 == 3)
def test_update_rev_extrema_rp(self):
    """
    Compare statistics.update_rev_extrema_rp against the stored expected
    results for every way of removing self.resample_k samples.
    """
    (exceeds, reverse, extrema_p, extrema_r,
     var1, var2) = utils.init_var_indicators(
        self.var1, self.var2, self.samp_var1, self.samp_var2, self.forward)

    for combo in itertools.combinations(range(self.n_samp),
                                        self.resample_k):
        # expected results are keyed by the string form of the index list
        indices = list(combo)

        # boolean masks that keep every position not listed in indices
        keep_var1 = ~np.in1d(range(len(var1)), indices)
        keep_var2 = ~np.in1d(range(len(var2)), indices)
        p_value, r_value = statistics.compute_pc(var1[keep_var1],
                                                 var2[keep_var2])

        expected = self.update_results[str(indices)]
        observed = statistics.update_rev_extrema_rp(
            self.sign, r_value, p_value, indices, reverse, extrema_p,
            extrema_r, self.forward)
        assert_almost_equal(expected, observed, decimal=5)
def _removal_exceeds(p_value, r_value, threshold, forward, param, fold,
                     fold_value, baseline):
    """
    Decide whether one resampled (p, r) pair counts towards `exceeds`.
    ----------------------------------------------------------------------------
    INPUTS
    p_value    - Float. P value recomputed after removing samples.
    r_value    - Float. Correlation recomputed after removing samples.
    threshold  - Float. Cutoff compared against p_value (param 'p') or
                 |r_value| (param 'r').
    forward    - Boolean. Must be exactly True or False; any other object
                 matches neither direction and yields False.
    param      - String. 'p' or 'r'; any other value yields False.
    fold       - Boolean. If truthy, the fold constraint must also hold.
    fold_value - Float. Fold factor applied to baseline.
    baseline   - Float or None. Original p value (param 'p') or original
                 |r| (param 'r'); only read when fold is truthy.

    OUTPUTS
    Boolean. True if the removed samples should be counted.
    """
    if param == 'p':
        if forward is True:
            # p must rise above the threshold (and fold * original); NaN counts
            if fold:
                return bool((p_value > threshold and
                             p_value > baseline * fold_value) or
                            np.isnan(p_value))
            return bool(p_value > threshold or np.isnan(p_value))
        if forward is False:
            # p must drop below the threshold (and original / fold);
            # NaN is not counted in this branch
            if fold:
                return bool(p_value < threshold and
                            p_value < baseline / fold_value)
            return bool(p_value < threshold)
    elif param == 'r':
        abs_r = np.abs(r_value)
        if forward is True:
            # |r| must fall below the threshold (and original * fold);
            # NaN counts
            if fold:
                return bool((abs_r < threshold and
                             abs_r < baseline * fold_value) or
                            np.isnan(r_value))
            return bool(abs_r < threshold or np.isnan(r_value))
        if forward is False:
            # fold change r-value restraint: |r| must rise above the
            # threshold (and original * fold); NaN counts
            if fold:
                return bool((abs_r > threshold and
                             abs_r > baseline * fold_value) or
                            np.isnan(r_value))
            return bool(abs_r > threshold or np.isnan(r_value))
    return False


def resamplek_cutie(var1_index, var2_index, n_samp, samp_var1, samp_var2,
                    pvalues, corrs, threshold, resample_k, sign, forward,
                    statistic, fold, fold_value, param):
    """
    Perform CUTIE resampling on a given pair of variables and test CUTIE
    status.
    ----------------------------------------------------------------------------
    INPUTS
    var1_index - Integer. Index of variable in file 1.
    var2_index - Integer. Index of variable in file 2.
    n_samp     - Integer. Number of samples.
    samp_var1  - 2D array. Each value in row i col j is the level of
                 variable j corresponding to sample i in the order that the
                 samples are presented in samp_ids when parsed.
    samp_var2  - 2D array. Same as samp_var1 but for file 2.
    pvalues    - 2D array. Entry row i, col j represents p value of
                 correlation between i-th var1 and j-th var2.
    corrs      - 2D array. Contains values of correlation strength between
                 var i and var j.
    threshold  - Float. Level of significance testing (after adjusting for
                 multiple comparisons).
    resample_k - Integer. Number of samples removed together in each
                 resampling; every combination of resample_k out of n_samp
                 sample indices is evaluated.
    sign       - Integer. -1 or 1, depending on original sign of correlation
                 to check against following re-evaluation.
    forward    - Boolean. True if CUTIE is run in the forward direction,
                 False if reverse.
    statistic  - String. One of 'pearson', 'rpearson', 'spearman',
                 'rspearman', 'kendall', 'rkendall'.
    fold       - Boolean. Determines whether the fold constraint must also
                 hold for a removal to be counted.
    fold_value - Float. Fold difference constraint imposed on the resampled
                 value.
    param      - String. Either 'r' or 'p' depending on whether r value or
                 p value will be used to filter correlations.

    OUTPUTS
    reverse    - 1D array. Index i is incremented for each removed set
                 containing sample i whose recomputed correlation sign
                 differs from sign.
    exceeds    - 1D array. Index i is incremented for each removed set
                 containing sample i that meets the threshold (and fold)
                 criterion for the chosen direction and param.
    extrema_p  - 1D array. As returned by update_rev_extrema_rp: lowest or
                 highest p value observed so far for each sample.
    extrema_r  - 1D array. Same as extrema_p but for correlation strength.

    RAISES
    ValueError - if statistic is not one of the supported names. This
                 replaces the UnboundLocalError the loop used to hit.
    """
    # pick the correlation routine once; fail early on an unknown statistic
    if statistic in ('pearson', 'rpearson'):
        compute_stat = compute_pc
    elif statistic in ('spearman', 'rspearman'):
        compute_stat = compute_sc
    elif statistic in ('kendall', 'rkendall'):
        compute_stat = compute_kc
    else:
        raise ValueError(
            'Unsupported statistic for resamplek_cutie: {!r}'.format(
                statistic))

    # initialize indicators and variables
    exceeds, reverse, extrema_p, extrema_r, var1, var2 = \
        utils.init_var_indicators(var1_index, var2_index, samp_var1,
                                  samp_var2, forward)

    # original value the fold constraint is measured against; it does not
    # change between resamplings, so look it up once
    baseline = None
    if fold:
        if param == 'p':
            baseline = pvalues[var1_index][var2_index]
        elif param == 'r':
            baseline = np.abs(corrs[var1_index][var2_index])

    # iteratively delete k samples and recompute statistics
    for combo in itertools.combinations(range(n_samp), resample_k):
        indices = list(combo)
        new_var1 = np.delete(var1, indices, axis=0)
        new_var2 = np.delete(var2, indices, axis=0)

        # remove NaNs
        new_var1, new_var2 = utils.remove_nans(new_var1, new_var2)

        # compute new p_value and r_value for the chosen statistic
        p_value, r_value = compute_stat(new_var1, new_var2)

        # update reverse and the running p / r extrema
        reverse, extrema_p, extrema_r = update_rev_extrema_rp(
            sign, r_value, p_value, indices, reverse, extrema_p, extrema_r,
            forward)

        # check sign reversal
        # NOTE(review): update_rev_extrema_rp also returns reverse; confirm it
        # does not already count sign reversals, otherwise this double counts.
        if np.sign(r_value) != sign:
            for i in indices:
                reverse[i] += 1

        if _removal_exceeds(p_value, r_value, threshold, forward, param,
                            fold, fold_value, baseline):
            for i in indices:
                exceeds[i] += 1

    return reverse, exceeds, extrema_p, extrema_r
def resample1_cutie_pc(var1_index, var2_index, samp_var1, samp_var2, **kwargs):
    """
    Takes a given var1 and var2 by indices and recomputes Pearson correlation
    by removing 1 out of n (sample_size) points from samp_ids.
    (UT in test_pointwise_metrics)
    ----------------------------------------------------------------------------
    INPUTS
    var1_index - Integer. Index for variable from file 1 in pairwise
                 correlation.
    var2_index - Integer. Index for variable from file 2 in pairwise
                 correlation.
    samp_var1  - 2D array. Each value in row i col j is the level of
                 variable j corresponding to sample i in the order that the
                 samples are presented in samp_ids.
    samp_var2  - 2D array. Same as samp_var1 but for file 2.
    **kwargs:
    threshold  - Float. Level of significance testing (after adjusting for
                 multiple comparisons).
    fold       - Boolean. Determines whether the fold constraint must also
                 hold for a removal to be counted.
    fold_value - Float. Fold difference constraint imposed on the resampled
                 value.
    param      - String. Either 'r' or 'p' depending on whether r value or
                 p value will be used to filter correlations.

    OUTPUTS
    reverse    - 1D array. As returned by update_rev_extrema_rp, which is
                 called here with sign 0.
    exceeds    - 1D array. Index i is incremented if removing sample i
                 pushes the p value above the threshold (param 'p') or |r|
                 below it (param 'r'), subject to the fold constraint when
                 fold is set, or if the recomputed value is NaN.
    corrs      - 1D array. Contains values of correlation strength with
                 sample i removed.
    p_values   - 1D array. Contains values of pvalues with sample i removed.
    """
    n_samp = samp_var1.shape[0]
    exceeds, reverse, maxp, minr, var1, var2 = utils.init_var_indicators(
        var1_index, var2_index, samp_var1, samp_var2, True)
    corrs = np.zeros(n_samp)
    p_values = np.zeros(n_samp)

    # Unpack as (p, r): this is the order resamplek_cutie uses for the same
    # helper. The previous (r, p) unpacking swapped the two values everywhere
    # below.
    original_p, original_r = compute_pc(var1, var2)

    # iteratively delete one sample and recompute statistics
    for s in range(n_samp):
        new_var1 = np.delete(var1, s, axis=0)
        new_var2 = np.delete(var2, s, axis=0)

        # compute new p_value and r_value
        p_value, r_value = compute_pc(new_var1, new_var2)

        # update reverse, maxp, and minr
        # sign is artificially 0 since we are not interested in that
        # Forward is True since we only apply Cook's D to TP/FP separation
        reverse, maxp, minr = update_rev_extrema_rp(
            0, r_value, p_value, [s], reverse, maxp, minr, True)

        if kwargs['param'] == 'p':
            if kwargs['fold']:
                # fold change p-value restraint
                if (p_value > kwargs['threshold'] and
                        p_value > original_p * kwargs['fold_value']) or \
                        np.isnan(p_value):
                    exceeds[s] += 1
            elif p_value > kwargs['threshold'] or np.isnan(p_value):
                exceeds[s] += 1
        elif kwargs['param'] == 'r':
            if kwargs['fold']:
                # fold change r-value restraint
                if (np.abs(r_value) < kwargs['threshold'] and
                        np.abs(r_value) <
                        np.abs(original_r) * kwargs['fold_value']) or \
                        np.isnan(r_value):
                    exceeds[s] += 1
            elif np.abs(r_value) < kwargs['threshold'] or np.isnan(r_value):
                exceeds[s] += 1

        corrs[s] = r_value
        p_values[s] = p_value

    return reverse, exceeds, corrs, p_values