Exemplo n.º 1
0
    def test_multinomial_elementwise_distribution(self):
        '''Verify that the created variables approach a multinomial distribution for large numbers
        of samples.

        For each sample size r in 2^4..2^16, draw samples with statutil.multinomial_elementwise,
        measure the error of the observed frequencies w.r.t. the desired frequencies p, and fit
        a power law error ~ C * r^alpha in log-log space. The test passes if alpha is about -0.5,
        C is below the bound computed from p, and the fit correlation exceeds 0.99.

        NOTE(review): no random seed is set inside this method, so the outcome depends on the
        random state established elsewhere (if any) -- confirm this cannot make the test flaky.'''
        (m, n, k) = (6, 5, 1)
        r = 2 ** np.arange(4, 17)
        p = statutil.random_row_stochastic((m, n))
        #p = statutil.scale_row_sums(np.ones((m, n)))
        error = np.zeros((len(r),))
        for (i, r_val) in enumerate(r):
            # range() instead of xrange(): identical iteration, and also valid on Python 3.
            for _ in range(k):
                x = statutil.multinomial_elementwise(p, r_val)
                # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
                error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p)
            # Average the error over the k experiments made for this sample size.
            error[i] /= (1.0 * k)
        # Validate the model error of the central limit theorem: C*r^(-0.5).
        # This is a consequence of the Central Limit Theorem. We are making k experiments for
        # each sample size r. Even if k=1, there's a 95% chance that we are within ~1.96 standard
        # deviations from the mean of the normal distribution sqrt(r)*[observed freq variable - p[i,j]]
        # for each entry j of a row i of the matrix p. So if row i's stddev is s[i], the sum of
        # squared errors should be (with 95% confidence) <= n * (1.96*s[i])^2. So
        # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
        # See http://en.wikipedia.org/wiki/Central_limit_theorem
        # Linear fit of log(error) = alpha*log(r) + log(C); exponentiate the intercept to get C.
        alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
        c = np.exp(c)
        assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')
        # Per-row second moment minus squared first moment of the column index under p (the
        # variance of the index if each row of p sums to 1).
        # NOTE(review): this per-row quantity is what the code substitutes for s[i] in the bound
        # above -- confirm it is the intended one.
        categories = np.arange(p.shape[1])
        row_spread = np.sum(p * categories ** 2, axis=1) - np.sum(p * categories, axis=1) ** 2
        # Use the public np.linalg.norm; the np.linalg.linalg module path is private/deprecated.
        c_bound = 1.96 * np.linalg.norm(row_spread, 2) / np.sqrt(p.shape[0])
        self.assertTrue(c <= c_bound, 'Error term coefficient outside 95% confidence interval')
        self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
Exemplo n.º 2
0
def __handle_estimate_genotype_frequencies(self, request):
    """Estimate genotype frequencies from the genotype data and save them in ProblemInfo.

    The count array stored under ``request.problem.info.snp["count"]`` is filled in place, one
    column per value of ``recode.GENOTYPE_CODE``. The row-scaled counts are then stored under
    ``request.problem.info.snp["frequency"]``.

    ``request`` must expose ``problem`` with ``info.snp``, ``genotype.data`` and ``num_snps``.

    Always returns False.
    NOTE(review): presumably False tells the caller that request processing should continue
    with further handlers -- confirm against the dispatching code.
    """
    # Load problem fields
    problem = request.problem
    snp_metadata = problem.info.snp
    snp_count = snp_metadata["count"]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # The frequency table column order matches the GENOTYPE_CODE array. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    # values() iterates exactly like the former itervalues() and also exists on Python 3.
    # NOTE(review): the column order relies on GENOTYPE_CODE's iteration order -- confirm it is
    # an ordered mapping.
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.values()):
        # First-axis indices of the entries equal to this code, histogrammed into num_snps bins.
        snp_count[:, col] = statutil.hist(np.where(r == genotype_code)[0], problem.num_snps)

    # Calculate frequencies
    snp_metadata["frequency"] = statutil.scale_row_sums(snp_count.astype("float"))

    return False
Exemplo n.º 3
0
 def test_multinomial_elementwise_distribution(self):
     '''Verify that the created variables approach a multinomial distribution for large numbers
     of samples.

     For each sample size r in 2^4..2^16, draw samples with statutil.multinomial_elementwise,
     measure the error of the observed frequencies w.r.t. the desired frequencies p, and fit
     a power law error ~ C * r^alpha in log-log space. The test passes if alpha is about -0.5,
     C is below the bound computed from p, and the fit correlation exceeds 0.99.

     NOTE(review): no random seed is set inside this method, so the outcome depends on the
     random state established elsewhere (if any) -- confirm this cannot make the test flaky.'''
     (m, n, k) = (6, 5, 1)
     r = 2**np.arange(4, 17)
     p = statutil.random_row_stochastic((m, n))
     #p = statutil.scale_row_sums(np.ones((m, n)))
     error = np.zeros((len(r), ))
     for (i, r_val) in enumerate(r):
         # range() instead of xrange(): identical iteration, and also valid on Python 3.
         for _ in range(k):
             x = statutil.multinomial_elementwise(p, r_val)
             # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
             error[i] += statutil.norm_frobenius_scaled(
                 statutil.hist(x, n) / (1.0 * r_val) - p)
         # Average the error over the k experiments made for this sample size.
         error[i] /= (1.0 * k)
     # Validate the model error of the central limit theorem: C*r^(-0.5).
     # This is a consequence of the Central Limit Theorem. We are making k experiments for
     # each sample size r. Even if k=1, there's a 95% chance that we are within ~1.96 standard
     # deviations from the mean of the normal distribution sqrt(r)*[observed freq variable - p[i,j]]
     # for each entry j of a row i of the matrix p. So if row i's stddev is s[i], the sum of
     # squared errors should be (with 95% confidence) <= n * (1.96*s[i])^2. So
     # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
     # See http://en.wikipedia.org/wiki/Central_limit_theorem
     # Linear fit of log(error) = alpha*log(r) + log(C); exponentiate the intercept to get C.
     alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
     c = np.exp(c)
     assert_almost_equal(alpha,
                         -0.5,
                         decimal=1,
                         err_msg='Unexpected error term growth power')
     # Per-row second moment minus squared first moment of the column index under p (the
     # variance of the index if each row of p sums to 1).
     # NOTE(review): this per-row quantity is what the code substitutes for s[i] in the bound
     # above -- confirm it is the intended one.
     categories = np.arange(p.shape[1])
     row_spread = (np.sum(p * categories**2, axis=1) -
                   np.sum(p * categories, axis=1)**2)
     # Use the public np.linalg.norm; the np.linalg.linalg module path is private/deprecated.
     c_bound = 1.96 * np.linalg.norm(row_spread, 2) / np.sqrt(p.shape[0])
     self.assertTrue(
         c <= c_bound,
         'Error term coefficient outside 95% confidence interval')
     self.assertTrue(
         abs(r_value) > 0.99,
         'Error does not fit a power law in sample size')
Exemplo n.º 4
0
def __handle_estimate_genotype_frequencies(self, request):
    '''Estimate genotype frequencies from the genotype data and save them in ProblemInfo.

    The count array stored under ``request.problem.info.snp['count']`` is filled in place, one
    column per value of ``recode.GENOTYPE_CODE``. The row-scaled counts are then stored under
    ``request.problem.info.snp['frequency']``.

    ``request`` must expose ``problem`` with ``info.snp``, ``genotype.data`` and ``num_snps``.

    Always returns False.
    NOTE(review): presumably False tells the caller that request processing should continue
    with further handlers -- confirm against the dispatching code.
    '''
    # Load problem fields
    problem = request.problem
    snp_metadata = problem.info.snp
    snp_count = snp_metadata['count']

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(problem.genotype.data)

    # Count genotype appearances for each SNP, and save in SNP annotation array.
    # The frequency table column order matches the GENOTYPE_CODE array. This includes filled
    # and missing genotypes: (1,1),(1,2),(2,2),(0,0).
    # values() iterates exactly like the former itervalues() and also exists on Python 3.
    # NOTE(review): the column order relies on GENOTYPE_CODE's iteration order -- confirm it is
    # an ordered mapping.
    for col, genotype_code in enumerate(recode.GENOTYPE_CODE.values()):
        # First-axis indices of the entries equal to this code, histogrammed into num_snps bins.
        snp_count[:, col] = statutil.hist(
            np.where(r == genotype_code)[0], problem.num_snps)

    # Calculate frequencies
    snp_metadata['frequency'] = statutil.scale_row_sums(
        snp_count.astype('float'))

    return False