Example #1
def test_kernel_empty(self):
    '''
    test trying to run kernel with an empty array
    '''
    counts = np.array([], dtype=np.float)
    size_factors = np.array([], dtype=np.float)
    membership = np.empty((0, ), dtype=INT_DTYPE)
    rng = kernel.RandomState()
    k = kernel.ssea_kernel(counts,
                           size_factors,
                           membership,
                           rng,
                           resample_counts=True,
                           permute_samples=True,
                           add_noise=True,
                           noise_loc=1.0,
                           noise_scale=1.0,
                           method_miss=3,
                           method_hit=3,
                           method_param=1.0)
    (ranks, norm_counts, norm_counts_miss, norm_counts_hit, es_val,
     es_rank, es_run) = k
    self.assertTrue(len(ranks) == 0)
    self.assertTrue(len(norm_counts) == 0)
    self.assertTrue(len(norm_counts_miss) == 0)
    self.assertTrue(len(norm_counts_hit) == 0)
    self.assertTrue(es_val == 0)
    self.assertTrue(es_rank == 0)
    self.assertTrue(es_run.shape == (0, ))
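
For context, the same call pattern applies to a small non-empty input. The sketch below is illustrative only and is not taken from the package's test suite: it reuses the assumptions of the test above (numpy as np, the kernel module, INT_DTYPE) with made-up toy values, and it unpacks the same seven-element return tuple.

# Illustrative sketch only: a toy call to kernel.ssea_kernel on a non-empty
# input. The kernel module, INT_DTYPE and the keyword arguments are taken
# from the test above; the array values are made up.
import numpy as np

counts = np.array([5.0, 0.0, 3.0, 8.0], dtype=np.float)        # toy expression counts
size_factors = np.array([1.0, 1.0, 1.0, 1.0], dtype=np.float)  # per-sample normalization
membership = np.array([1, 0, 1, 0], dtype=INT_DTYPE)           # 0/1 set membership
rng = kernel.RandomState()
k = kernel.ssea_kernel(counts, size_factors, membership, rng,
                       resample_counts=True,
                       permute_samples=True,
                       add_noise=True,
                       noise_loc=1.0,
                       noise_scale=1.0,
                       method_miss=3,
                       method_hit=3,
                       method_param=1.0)
(ranks, norm_counts, norm_counts_miss, norm_counts_hit, es_val,
 es_rank, es_run) = k
# The names mirror the tuple unpacked in the test; for the empty input above,
# every output is empty and es_val == es_rank == 0.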
Example #2
def ssea_run(counts, size_factors, membership, rng, config):
    '''
    counts: numpy array of float values
    size_factors: normalization factors for counts
    membership: int array (0 or 1) with set membership
    rng: RandomState object
    config: Config object
    '''
    # run kernel to generate a range of observed enrichment scores
    resample_rand_seeds = np.empty(config.resampling_iterations, dtype=np.int)
    resample_count_ranks = np.empty(
        (config.resampling_iterations, counts.shape[0]), dtype=np.int)
    resample_es_vals = np.zeros(config.resampling_iterations, dtype=np.float)
    resample_es_ranks = np.zeros(config.resampling_iterations, dtype=np.int)
    for i in xrange(config.resampling_iterations):
        # save random number generator seed before running kernel
        resample_rand_seeds[i] = rng.seed
        k = ssea_kernel(counts,
                        size_factors,
                        membership,
                        rng,
                        resample_counts=True,
                        permute_samples=False,
                        add_noise=True,
                        noise_loc=config.noise_loc,
                        noise_scale=config.noise_scale,
                        method_miss=config.weight_miss,
                        method_hit=config.weight_hit,
                        method_param=config.weight_param)
        k = KernelResult._make(k)
        resample_count_ranks[i] = k.ranks
        resample_es_vals[i] = k.es_val
        resample_es_ranks[i] = k.es_rank
    # find the median ES value
    median_index = int(config.resampling_iterations / 2)
    median_index = resample_es_vals.argsort()[median_index]
    median_es_val = resample_es_vals[median_index]
    # choose whether to use the positive or negative side of the
    # distribution based on the median ES value
    if median_es_val == 0:
        return Result.default(), np.zeros((0, ), dtype=np.float)
    elif median_es_val < 0:
        signfunc = np.less
    else:
        signfunc = np.greater
    # subset to include only the corresponding side of the distribution
    resample_sign_inds = signfunc(resample_es_vals, 0)
    resample_rand_seeds = resample_rand_seeds[resample_sign_inds]
    resample_count_ranks = resample_count_ranks[resample_sign_inds]
    resample_es_vals = resample_es_vals[resample_sign_inds]
    resample_es_ranks = resample_es_ranks[resample_sign_inds]
    # determine the median value
    median_index = int(resample_es_vals.shape[0] / 2)
    median_index = resample_es_vals.argsort()[median_index]
    # select the kernel run representing the median ES value
    rand_seed = resample_rand_seeds[median_index]
    es_val = resample_es_vals[median_index]
    es_rank = resample_es_ranks[median_index]
    ranks = resample_count_ranks[median_index]
    es_sign = cmp(es_val, 0)
    # permute samples and determine ES null distribution
    null_es_vals = np.zeros(config.perms, dtype=np.float)
    null_es_ranks = np.zeros(config.perms, dtype=np.float)
    i = 0
    while i < config.perms:
        k = ssea_kernel(counts,
                        size_factors,
                        membership,
                        rng,
                        resample_counts=True,
                        permute_samples=True,
                        add_noise=True,
                        noise_loc=config.noise_loc,
                        noise_scale=config.noise_scale,
                        method_miss=config.weight_miss,
                        method_hit=config.weight_hit,
                        method_param=config.weight_param)
        k = KernelResult._make(k)
        if cmp(k.es_val, 0) == es_sign:
            null_es_vals[i] = k.es_val
            null_es_ranks[i] = k.es_rank
            i += 1
    # Subset the null ES scores to only positive or negative
    null_sign_inds = signfunc(null_es_vals, 0)
    null_es_vals = null_es_vals[null_sign_inds]
    null_es_ranks = null_es_ranks[null_sign_inds]
    # Adjust for variation in gene set size. Normalize ES(S,null)
    # and the observed ES(S), separately rescaling the positive and
    # negative scores by dividing by the mean of the ES(S,null) to
    # yield normalized scores NES(S,null)
    null_es_mean = np.fabs(null_es_vals.mean())
    null_nes_vals = null_es_vals / null_es_mean
    # Normalize the observed ES(S) by rescaling by the mean of
    # the ES(S,null) separately for positive and negative ES(S)
    nes_val = es_val / null_es_mean
    # estimate nominal p value for S from ES(S,null) by using the
    # positive or negative portion of the distribution corresponding
    # to the sign of the observed ES(S)
    p_value = (np.fabs(null_es_vals) >= np.fabs(es_val)).sum().astype(np.float)
    p_value /= null_es_vals.shape[0]
    # Create result object for this SSEA test
    res = Result()
    res.rand_seed = int(rand_seed)
    res.es = round(es_val, FLOAT_PRECISION)
    res.es_rank = int(es_rank)
    res.nominal_p_value = round(p_value, SCIENTIFIC_NOTATION_PRECISION)
    res.nes = round(nes_val, FLOAT_PRECISION)
    # save some of the resampled es points
    res.resample_es_vals = np.around(resample_es_vals[:Result.MAX_POINTS],
                                     FLOAT_PRECISION)
    res.resample_es_ranks = resample_es_ranks[:Result.MAX_POINTS]
    # save null distribution points
    res.null_es_mean = null_es_mean
    res.null_es_vals = np.around(null_es_vals[:Result.MAX_POINTS],
                                 FLOAT_PRECISION)
    res.null_es_ranks = null_es_ranks[:Result.MAX_POINTS]
    # get indexes of hits in this set
    m = membership[ranks]
    hit_inds = (m > 0).nonzero()[0]
    num_hits = hit_inds.shape[0]
    num_misses = m.shape[0] - num_hits
    # calculate leading edge stats
    if es_val < 0:
        core_hits = sum(i >= es_rank for i in hit_inds)
        core_misses = (m.shape[0] - es_rank) - core_hits
    else:
        core_hits = sum(i <= es_rank for i in hit_inds)
        core_misses = 1 + es_rank - core_hits
    null_hits = num_hits - core_hits
    null_misses = num_misses - core_misses
    # fisher exact test (one-sided hypothesis that LE is enriched)
    fisher_p_value = fisher.pvalue(core_hits, core_misses, null_hits,
                                   null_misses).right_tail
    # odds ratio
    n = np.inf if null_hits == 0 else float(core_hits) / null_hits
    d = np.inf if null_misses == 0 else float(core_misses) / null_misses
    if np.isfinite(n) and np.isfinite(d):
        if n == 0 and d == 0:
            odds_ratio = np.nan
        else:
            odds_ratio = np.inf if d == 0 else (n / d)
    elif np.isfinite(d):
        odds_ratio = np.inf
    else:
        odds_ratio = np.nan if n == 0 else 0.0
    # create dictionary result
    res.core_hits = int(core_hits)
    res.core_misses = int(core_misses)
    res.null_hits = int(null_hits)
    res.null_misses = int(null_misses)
    res.fisher_p_value = np.round(fisher_p_value,
                                  SCIENTIFIC_NOTATION_PRECISION)
    res.odds_ratio = odds_ratio
    # return result and null distribution for subsequent fdr calculations
    return res, null_nes_vals
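
A caller needs a Config-like object carrying the attributes ssea_run reads. The sketch below is a hypothetical driver, not part of the package: FakeConfig is a made-up stand-in exposing only those attributes (resampling_iterations, perms, noise_loc, noise_scale, weight_miss, weight_hit, weight_param), the arrays are toy values, and kernel.RandomState is borrowed from Example #1; in practice the package's own Config class would be used.

# Hypothetical driver sketch (not from the package). FakeConfig stands in for
# the real Config class and carries only the attributes ssea_run reads above.
from collections import namedtuple
import numpy as np

FakeConfig = namedtuple('FakeConfig',
                        ['resampling_iterations', 'perms',
                         'noise_loc', 'noise_scale',
                         'weight_miss', 'weight_hit', 'weight_param'])
config = FakeConfig(resampling_iterations=101, perms=1000,
                    noise_loc=1.0, noise_scale=1.0,
                    weight_miss=3, weight_hit=3, weight_param=1.0)

counts = np.array([5.0, 0.0, 3.0, 8.0, 2.0, 7.0], dtype=np.float)  # toy counts
size_factors = np.ones(counts.shape[0], dtype=np.float)            # no normalization
membership = np.array([1, 0, 1, 0, 0, 1], dtype=np.int)            # 0/1 set membership
rng = kernel.RandomState()  # same RandomState class as in Example #1

res, null_nes_vals = ssea_run(counts, size_factors, membership, rng, config)
# res carries the observed ES/NES, nominal and Fisher p-values, and the
# leading-edge counts; null_nes_vals is kept for the FDR step mentioned in the
# final comment of ssea_run.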