示例#1
0
    def test_remove_nans(self):
        """
        Exercise utils.remove_nans on four kinds of paired input: empty lists,
        lists with nans scattered through both, lists sharing no nan-free
        position, and lists without any nan.
        """
        nan = np.nan

        # Empty input: neither output may contain a truthy element
        out1, out2 = utils.remove_nans([], [])
        assert not out1.any()
        assert not out2.any()

        # Some nan: indices 3 and 4 are the only positions where both inputs
        # hold a number, so exactly [4, 5] is expected from each output
        first = [1, nan, nan, 4, 5, 6, nan]
        second = [nan, 2, nan, 4, 5, nan, 7]
        expected = np.array([4, 5])

        out1, out2 = utils.remove_nans(first, second)
        assert (out1 == expected).all()
        assert (out2 == expected).all()

        # All nan: every position has a nan in at least one of the inputs
        first = [nan] * 4
        second = [nan] * 3 + [7]

        out1, out2 = utils.remove_nans(first, second)
        assert not out1.any()
        assert not out2.any()

        # No nan: outputs must match the inputs element for element
        first = list(range(10))
        second = [i ** 2 for i in range(10)]

        out1, out2 = utils.remove_nans(first, second)
        assert (out1 == np.array(first)).all()
        assert (out2 == np.array(second)).all()
示例#2
0
def calculate_FP_sets(initial_corr, samp_var1, samp_var2, infln_metrics,
                      infln_mapping, threshold, fold, fold_value, param):
    """
    Sort the initially classified variable pairs into one false-positive (FP)
    set per influence metric.
    ----------------------------------------------------------------------------
    INPUTS
    initial_corr  - Iterable of integer tuples (var1 index, var2 index). The
                    variable pairs to be tested against every metric.
    samp_var1     - 2D array. Column j holds the values of variable j of file 1
                    across samples.
    samp_var2     - 2D array. Same as samp_var1 but for file 2.
    infln_metrics - List of strings. Names of the metrics to evaluate (such as
                    'cookd'); each becomes a key of the returned dictionary.
    infln_mapping - Dictionary. Maps each metric name to the function that
                    evaluates it. The function is called with the two variable
                    indices, both data matrices and the keyword arguments
                    influence, threshold, fold, fold_value and param, and must
                    return four values, the second of which is `exceeds`.
    threshold     - Float. Forwarded unchanged to the metric functions.
    fold          - Boolean. Forwarded unchanged to the metric functions.
    fold_value    - Float. Forwarded unchanged to the metric functions.
    param         - String. Either 'r' or 'p'; forwarded unchanged to the
                    metric functions.

    OUTPUTS
    FP_infln_sets - Dictionary. Key is a metric name, entry is the set of pairs
                    for which that metric's `exceeds` output has a non-zero sum.
                    Pairs left with at most one observation in either variable
                    after nan removal are not evaluated and appear in no set.
    """
    # every metric starts out with an empty FP set
    FP_infln_sets = {metric: set() for metric in infln_metrics}

    for corr_pair in initial_corr:
        idx1, idx2 = corr_pair

        # the influence object is built from nan-free observations only
        clean1, clean2 = utils.remove_nans(samp_var1[:, idx1],
                                           samp_var2[:, idx2])
        if len(clean1) <= 1 or len(clean2) <= 1:
            continue

        influence = return_influence(clean1, clean2)
        for metric in infln_metrics:
            metric_func = infln_mapping[metric]
            # only `exceeds` is needed; the other three outputs are discarded
            _, exceeds, _, _ = metric_func(
                idx1, idx2, samp_var1, samp_var2, influence=influence,
                threshold=threshold, fold=fold, fold_value=fold_value,
                param=param)

            # a zero sum means no sample was flagged, i.e. the pair is a TP
            if exceeds.sum() != 0:
                FP_infln_sets[metric].add(corr_pair)

    return FP_infln_sets
示例#3
0
    def test_pointwise_metrics(self):
        """
        Check that every pointwise influence metric reproduces its own output.

        For each variable pair in self.tuples, each param in ('p', 'r') and
        each function in self.infln_mapping, the four returned arrays are
        written to an .npz archive under self.work_dir, dumped to one text file
        per array, and kept as the reference. The same functions are then
        called a second time and compared with the reference using
        assert_almost_equal.
        """
        # generate results and output intermediate file
        pointwise_results = {}
        for t in self.tuples:
            t1, t2 = t
            pointwise_results[str(t)] = {}

            for p in ['p', 'r']:
                pointwise_results[str(t)][p] = {}

                for f in self.infln_mapping:
                    x_old = self.samp_var1[:, t1]
                    y_old = self.samp_var2[:, t2]

                    var1_values, var2_values = utils.remove_nans(x_old, y_old)
                    influence = statistics.return_influence(var1_values, var2_values)

                    arr_0, arr_1, arr_2, arr_3 = self.infln_mapping[f](var1_index=t1,
                        var2_index=t2, samp_var1=self.samp_var1, samp_var2=self.samp_var2,
                        influence=influence, threshold=self.threshold[p], fold=self.fold,
                        fold_value=self.fold_value[p], param=p)

                    # save results to compressed object and later text file
                    fp = self.work_dir + '_'.join([str(t), p, f, '.npz'])
                    np.savez(fp, arr_0, arr_1, arr_2, arr_3)
                    results = []
                    # the loaded archive keeps a file handle open; the with
                    # block closes it once every array has been read
                    with np.load(fp) as archive:
                        for key, value in archive.items():
                            results.append(value)
                            np.savetxt(self.work_dir + '_'.join([str(t), p, f, key + '.txt']), value)

                    pointwise_results[str(t)][p][f] = results

        # test cutie, cookd, dffits, dsr
        # with inputs mixed with nan and neg values as defined in setUp
        for t in self.tuples:
            t1, t2 = t
            var1_values = self.samp_var1[:, t1]
            var2_values = self.samp_var2[:, t2]
            # NOTE(review): the reference run above strips nans before calling
            # return_influence, this run does not - confirm that is intended
            influence = statistics.return_influence(var1_values, var2_values)

            for p in ['p', 'r']:
                for f in self.infln_mapping:
                    results = self.infln_mapping[f](var1_index=t1,
                        var2_index=t2, samp_var1=self.samp_var1, samp_var2=self.samp_var2,
                        influence=influence,
                        threshold=self.threshold[p], fold=self.fold,
                        fold_value=self.fold_value[p], param=p)
                    # comparison to 7 decimal places is the default value
                    assert_almost_equal(pointwise_results[str(t)][p][f], results)
示例#4
0
def compute_kc(new_var1, new_var2):
    """
    Compute the Kendall correlation of two variables after nan removal.
    ----------------------------------------------------------------------------
    INPUTS
    new_var1 - Array. Observations for a given variable from file 1.
    new_var2 - Array. Same as new_var1 but for file 2.

    OUTPUTS
    p_value  - Float. P value reported by scipy.stats.kendalltau, or nan if the
               computation raised a ValueError.
    r_value  - Float. Correlation reported by scipy.stats.kendalltau, or nan if
               the computation raised a ValueError.
    Note the p value is returned first, the correlation second.
    """
    clean_var1, clean_var2 = utils.remove_nans(new_var1, new_var2)

    try:
        tau, tau_pvalue = scipy.stats.kendalltau(clean_var1, clean_var2)
    except ValueError:
        # the statistic could not be computed on this input
        return np.nan, np.nan

    return tau_pvalue, tau
示例#5
0
def initial_stats(samp_var1, samp_var2, corr_func, paired):
    """
    Compute the correlation and p value between every column of samp_var1 and
    every column of samp_var2 using the supplied correlation function.
    ----------------------------------------------------------------------------
    INPUTS
    samp_var1  - 2D array. Column j holds the values of variable j of file 1
                 across samples.
    samp_var2  - 2D array. Same as samp_var1 but for file 2.
    corr_func  - Function. Takes two arrays and returns a (correlation,
                 p value) pair (e.g. scipy.stats.pearsonr,
                 scipy.stats.spearmanr, scipy.stats.kendalltau).
    paired     - Boolean. When truthy, only entries with row index strictly
                 greater than column index are computed; all other entries
                 keep their initial value of 0.

    OUTPUTS
    corrs      - 2D array, n_var1 x n_var2. Entry (i, j) is the correlation
                 between var i of file 1 and var j of file 2, or nan if
                 corr_func raised a ValueError for that pair.
    pvalues    - 2D array, n_var1 x n_var2. Same as corrs but for the p value.
    """
    # the third value returned by get_param is not needed here
    n_var1, n_var2, _ = utils.get_param(samp_var1, samp_var2)

    shape = [n_var1, n_var2]
    corrs = np.zeros(shape)
    pvalues = np.zeros(shape)

    for i in range(n_var1):
        for j in range(n_var2):
            # in paired mode skip the diagonal and everything above it
            if paired and i <= j:
                continue

            # correlate only the nan-free observations of the two columns
            col1, col2 = utils.remove_nans(samp_var1[:, i], samp_var2[:, j])

            try:
                corrs[i, j], pvalues[i, j] = corr_func(col1, col2)
            except ValueError:
                corrs[i, j] = np.nan
                pvalues[i, j] = np.nan

    return corrs, pvalues
示例#6
0
def _cutie_exceeds(p_value, r_value, pvalues, corrs, var1_index, var2_index,
                   threshold, forward, fold, fold_value, param):
    """
    Decide whether one recomputed (p_value, r_value) pair counts towards
    `exceeds` in resamplek_cutie. Returns a plain bool; False is returned for
    any forward/param combination that is not handled below.
    """
    if forward is True:
        if param == 'p':
            if fold:
                # fold change p-value restraint
                return bool(
                    (p_value > threshold and
                     p_value > pvalues[var1_index][var2_index] * fold_value)
                    or np.isnan(p_value))
            return bool(p_value > threshold or np.isnan(p_value))
        if param == 'r':
            abs_r = np.abs(r_value)
            if fold:
                # fold change r-value restraint
                return bool(
                    (abs_r < threshold and
                     abs_r < np.abs(corrs[var1_index][var2_index]) * fold_value)
                    or np.isnan(r_value))
            return bool(abs_r < threshold or np.isnan(r_value))

    elif forward is False:
        if param == 'p':
            if fold:
                # fold change p-value restraint; a nan p value never counts
                return bool(
                    p_value < threshold and
                    p_value < pvalues[var1_index][var2_index] / fold_value)
            return bool(p_value < threshold)
        if param == 'r':
            abs_r = np.abs(r_value)
            if fold:
                # fold change r-value restraint
                return bool(
                    (abs_r > threshold and
                     abs_r > np.abs(corrs[var1_index][var2_index]) * fold_value)
                    or np.isnan(r_value))
            return bool(abs_r > threshold or np.isnan(r_value))

    return False


def resamplek_cutie(var1_index, var2_index, n_samp, samp_var1, samp_var2,
                    pvalues, corrs, threshold, resample_k, sign, forward,
                    statistic, fold, fold_value, param):
    """
    Perform CUTIE resampling on a given pair of variables and test CUTIE status.
    Every combination of resample_k sample indices out of range(n_samp) is
    removed in turn, the statistic is recomputed on the remaining nan-free
    observations, and the removed indices are flagged accordingly.
    ----------------------------------------------------------------------------
    INPUTS
    var1_index        - Integer. Index of variable in file 1.
    var2_index        - Integer. Index of variable in file 2.
    n_samp            - Integer. Number of samples.
    samp_var1         - 2D array. Each value in row i col j is the level of
                        variable j corresponding to sample i in the order that
                        the samples are presented in samp_ids when parsed.
    samp_var2         - 2D array. Same as samp_var1 but for file 2.
    pvalues           - 2D array. Entry row i, col j represents p value of
                        correlation between i-th var1 and j-th var2. Only read
                        when fold is truthy and param is 'p'.
    corrs             - 2D array. Contains values of correlation strength
                        between var i and var j. Only read when fold is truthy
                        and param is 'r'.
    threshold         - Float. Level of significance testing (after adjusting
                        for multiple comparisons)
    resample_k        - Integer. Number of samples removed at a time.
    sign              - Integer. -1 or 1, depending on original sign of
                        correlation to check against following re-evaluation.
    forward           - Boolean. True if CUTIE is run in the forward direction,
                        False if reverse. Compared by identity, so only the
                        builtin True / False select a branch.
    statistic         - String. One of 'pearson', 'rpearson', 'spearman',
                        'rspearman', 'kendall', 'rkendall'.
    fold              - Boolean. Determines whether you require the new P value
                        to be a certain fold greater to be classified as a CUTIE.
    fold_value        - Float. Determines fold difference constraint imposed on
                        the resampled p-value needed for a correlation to be
                        classified as a CUTIE.
    param             - String. Either 'r' or 'p' depending on whether r value or p
                        value will be used to filter correlations.
    OUTPUTS
    reverse           - 1D array. Index i is incremented whenever removing a
                        combination containing sample i gives a correlation
                        whose sign differs from `sign`.
    exceeds           - 1D array. Index i is incremented whenever removing a
                        combination containing sample i makes the recomputed
                        statistic cross the threshold (see _cutie_exceeds).
    extrema_p         - 1D array. Maintained by update_rev_extrema_rp; per the
                        original documentation it holds the lowest or highest p
                        value observed so far for each sample.
    extrema_r         - 1D array. Same as extrema_p but for R / correlation
                        strength values.
    RAISES
    ValueError        - If statistic is not one of the supported names and at
                        least one combination is evaluated.
    """
    # initialize indicators and variables
    exceeds, reverse, extrema_p, extrema_r, var1, var2 = utils.init_var_indicators(
        var1_index, var2_index, samp_var1, samp_var2, forward)

    # iteratively delete k samples and recompute statistics; combinations are
    # consumed lazily instead of being materialised as one large list
    for combo in itertools.combinations(range(n_samp), resample_k):
        indices = list(combo)

        # keep every observation whose position is not among the removed ones
        new_var1 = var1[~np.isin(np.arange(len(var1)), indices)]
        new_var2 = var2[~np.isin(np.arange(len(var2)), indices)]

        # remove NaNs
        new_var1, new_var2 = utils.remove_nans(new_var1, new_var2)

        # compute new p_value and r_value depending on statistic
        if statistic in ('pearson', 'rpearson'):
            p_value, r_value = compute_pc(new_var1, new_var2)
        elif statistic in ('spearman', 'rspearman'):
            p_value, r_value = compute_sc(new_var1, new_var2)
        elif statistic in ('kendall', 'rkendall'):
            p_value, r_value = compute_kc(new_var1, new_var2)
        else:
            # previously p_value / r_value were left unbound in this case
            raise ValueError('Unsupported statistic: %r' % (statistic,))

        # update reverse, maxp, and minr
        reverse, extrema_p, extrema_r = update_rev_extrema_rp(
            sign, r_value, p_value, indices, reverse, extrema_p, extrema_r,
            forward)

        # check sign reversal
        # NOTE(review): update_rev_extrema_rp also receives and returns
        # `reverse`; if it already counts sign flips they are counted a second
        # time here - confirm against that function
        if np.sign(r_value) != sign:
            for i in indices:
                reverse[i] += 1

        # flag every removed sample when the recomputed statistic crosses the
        # threshold for the chosen direction and param
        if _cutie_exceeds(p_value, r_value, pvalues, corrs, var1_index,
                          var2_index, threshold, forward, fold, fold_value,
                          param):
            for i in indices:
                exceeds[i] += 1

    return reverse, exceeds, extrema_p, extrema_r