Пример #1
0
    def test_multinomial_elementwise_distribution(self):
        '''Verify that the created variables approach a multinomial distribution for large numbers
        of samples.

        For every sample size r in 2^4 .. 2^16, r values per row are drawn with
        statutil.multinomial_elementwise() from a matrix p built by
        statutil.random_row_stochastic(). The deviation of the observed category frequencies from
        p is measured with statutil.norm_frobenius_scaled() and averaged over k repetitions.
        A linear regression of log(error) against log(r) must then have a slope of about -0.5,
        an intercept below a 95%-confidence bound, and a correlation coefficient above 0.99 in
        absolute value.'''
        (m, n, k) = (6, 5, 1)
        r = 2 ** np.arange(4, 17)
        p = statutil.random_row_stochastic((m, n))
        error = np.zeros((len(r),))
        for (i, r_val) in enumerate(r):
            for _ in range(k):
                x = statutil.multinomial_elementwise(p, r_val)
                # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
                error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p)
            error[i] /= (1.0 * k)

        # Validate the error model C * r^(-0.5), which is a consequence of the Central Limit
        # Theorem. We are making k experiments for each sample size r. Even if k=1, there is a
        # 95% chance of being within ~1.96 standard deviations of the mean of the (approximately)
        # normal variable sqrt(r) * [observed frequency - p[i,j]] for each entry j of a row i of
        # the matrix p. So if row i's stddev is s[i], the sum of squared errors should be (with
        # 95% confidence) <= n * (1.96*s[i])^2, which gives
        # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
        # See http://en.wikipedia.org/wiki/Central_limit_theorem
        alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
        c = np.exp(c)  # The regression intercept is log(C)
        assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')

        # Per-row variance of the category index under row i of p: E[j^2] - E[j]^2
        category = np.arange(p.shape[1])
        row_variance = np.sum(p * category ** 2, axis=1) - np.sum(p * category, axis=1) ** 2
        # Use the public np.linalg.norm; np.linalg.linalg is a private implementation module
        c_bound = 1.96 * np.linalg.norm(row_variance, 2) / np.sqrt(p.shape[0])
        self.assertTrue(c <= c_bound,
                        'Error term coefficient outside 95% confidence interval')
        self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
Пример #2
0
 def test_multinomial_elementwise_vector(self):
     '''Draw one multinomial value per row (r=1) and compare against the recorded sequence.

     NOTE(review): the fixed expected values presumably rely on the random number generator
     being seeded outside this method - confirm against the test fixture.'''
     expected = [3, 1, 3, 4, 0, 2, 2, 3, 1, 4, 2, 3, 2, 1, 4, 2, 4, 3, 4, 3]
     num_rows, num_categories = 20, 5
     probabilities = statutil.random_row_stochastic((num_rows, num_categories))
     observed = statutil.multinomial_elementwise(probabilities)
     assert_equal(observed, expected, 'Wrong random multinomial generated')
Пример #3
0
 def test_multinomial_elementwise_vector(self):
     '''Check the single-sample case (r=1): one multinomial value is generated per row of p.

     NOTE(review): comparing against hard-coded values presumably requires a seeded random
     number generator set up elsewhere - confirm.'''
     shape = (20, 5)
     row_probabilities = statutil.random_row_stochastic(shape)
     samples = statutil.multinomial_elementwise(row_probabilities)
     reference = [3, 1, 3, 4, 0, 2, 2, 3, 1, 4, 2, 3, 2, 1, 4, 2, 4, 3, 4, 3]
     assert_equal(samples, reference, 'Wrong random multinomial generated')
Пример #4
0
 def test_multinomial_elementwise_matrix(self):
     '''Draw two multinomial values per row (r > 1) and compare against the recorded matrix.

     NOTE(review): the fixed expected values presumably rely on the random number generator
     being seeded outside this method - confirm against the test fixture.'''
     expected = [
         [3, 3], [1, 2], [3, 3], [4, 3], [0, 1],
         [2, 2], [2, 0], [3, 1], [1, 2], [4, 0],
         [2, 2], [3, 1], [2, 1], [1, 1], [4, 2],
         [2, 3], [4, 3], [3, 4], [4, 3], [3, 1],
     ]
     num_rows, num_categories = 20, 5
     probabilities = statutil.random_row_stochastic((num_rows, num_categories))
     observed = statutil.multinomial_elementwise(probabilities, 2)
     assert_equal(observed, expected, 'Wrong random multinomial generated')
Пример #5
0
 def _simulate_imputation(self):
     '''Simulate imputing i from phased T-haplotypes.

     Draws one sample with statutil.multinomial_elementwise(self.p), maps it through the
     IMPUTABLE lookup table and updates the running counters:
       - self.count      is always incremented;
       - self.count_part is incremented when at least one allele is imputable;
       - self.count_full is incremented when both alleles are imputable.'''
     # assumes IMPUTABLE holds small non-negative integer codes - TODO confirm
     x = IMPUTABLE[statutil.multinomial_elementwise(self.p)]
     # Floor division (//) keeps the integer-quotient meaning regardless of whether true
     # division is in effect; with true division x / 2 is non-zero for every non-zero code.
     a0 = np.any(x // 2)  # Quotient non-zero for at least one element of x ==> allele 0 is imputable
     a1 = np.any(x % 2)  # Remainder non-zero for at least one element of x ==> allele 1 is imputable
     if a0 and a1:  # Both alleles are imputable
         self.count_full += 1
     if a0 or a1:  # Either allele is imputable
         self.count_part += 1
     self.count += 1
Пример #6
0
 def test_multinomial_elementwise_matrix(self):
     '''Check the multi-sample case (r > 1): two multinomial values are generated per row of p.

     NOTE(review): comparing against hard-coded values presumably requires a seeded random
     number generator set up elsewhere - confirm.'''
     shape = (20, 5)
     samples_per_row = 2
     row_probabilities = statutil.random_row_stochastic(shape)
     samples = statutil.multinomial_elementwise(row_probabilities, samples_per_row)
     reference = [[3, 3], [1, 2], [3, 3], [4, 3],
                  [0, 1], [2, 2], [2, 0], [3, 1],
                  [1, 2], [4, 0], [2, 2], [3, 1],
                  [2, 1], [1, 1], [4, 2], [2, 3],
                  [4, 3], [3, 4], [4, 3], [3, 1]]
     assert_equal(samples, reference, 'Wrong random multinomial generated')
Пример #7
0
 def _simulate_imputation(self):
     '''Simulate imputing i from phased T-haplotypes.

     One sample is drawn with statutil.multinomial_elementwise(self.p) and translated through
     the IMPUTABLE lookup table. The counters are then updated: self.count_full when both
     alleles are imputable, self.count_part when at least one is, and self.count on every call.'''
     # assumes IMPUTABLE holds small non-negative integer codes - TODO confirm
     x = IMPUTABLE[statutil.multinomial_elementwise(self.p)]
     # Use explicit floor division: under true division x / 2 would be non-zero for any
     # non-zero code, so the quotient test would no longer be distinct from "x != 0".
     # Quotient non-zero for at least one element of x ==> allele 0 is imputable
     a0 = np.any(x // 2)
     # Remainder non-zero for at least one element of x ==> allele 1 is imputable
     a1 = np.any(x % 2)
     if a0 and a1:  # Both alleles are imputable
         self.count_full += 1
     if a0 or a1:  # Either allele is imputable
         self.count_part += 1
     self.count += 1
Пример #8
0
 def test_multinomial_elementwise_distribution(self):
     '''Verify that the created variables approach a multinomial distribution for large numbers
     of samples.

     For each sample size r in 2^4 .. 2^16 the test draws r values per row of a matrix p
     (from statutil.random_row_stochastic) using statutil.multinomial_elementwise, and records
     statutil.norm_frobenius_scaled of (observed frequencies - p), averaged over k runs.
     The log-log regression of that error against r is checked for a slope of about -0.5,
     a bounded coefficient, and a correlation coefficient above 0.99 in absolute value.'''
     (m, n, k) = (6, 5, 1)
     r = 2**np.arange(4, 17)
     p = statutil.random_row_stochastic((m, n))
     error = np.zeros((len(r), ))
     for (i, r_val) in enumerate(r):
         for _ in range(k):
             x = statutil.multinomial_elementwise(p, r_val)
             # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
             error[i] += statutil.norm_frobenius_scaled(
                 statutil.hist(x, n) / (1.0 * r_val) - p)
         error[i] /= (1.0 * k)

     # Validate the error model C * r^(-0.5), which is a consequence of the Central Limit
     # Theorem. We are making k experiments for each sample size r. Even if k=1, there is a
     # 95% chance of being within ~1.96 standard deviations of the mean of the (approximately)
     # normal variable sqrt(r) * [observed frequency - p[i,j]] for each entry j of a row i of
     # the matrix p. So if row i's stddev is s[i], the sum of squared errors should be (with
     # 95% confidence) <= n * (1.96*s[i])^2, which gives
     # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
     # See http://en.wikipedia.org/wiki/Central_limit_theorem
     alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
     c = np.exp(c)  # The regression intercept is log(C)
     assert_almost_equal(alpha,
                         -0.5,
                         decimal=1,
                         err_msg='Unexpected error term growth power')

     # Per-row variance of the category index under row i of p: E[j^2] - E[j]^2
     category = np.arange(p.shape[1])
     row_variance = (np.sum(p * category**2, axis=1) -
                     np.sum(p * category, axis=1)**2)
     # Use the public np.linalg.norm; np.linalg.linalg is a private implementation module
     c_bound = 1.96 * np.linalg.norm(row_variance, 2) / np.sqrt(p.shape[0])
     self.assertTrue(
         c <= c_bound,
         'Error term coefficient outside 95% confidence interval')
     self.assertTrue(
         abs(r_value) > 0.99,
         'Error does not fit a power law in sample size')
Пример #9
0
def __handle_fill_missing_genotypes(self, request):
    '''Replace missing genotype entries with random draws from the per-SNP genotype distribution.

    The genotype array request.problem.genotype.data is modified in place: every entry reported
    as missing by recode.where_missing() is overwritten with a genotype sampled from the
    FILLED_GENOTYPES columns of that SNP's 'frequency' record (rescaled by scale_row_sums()).
    Always returns False.'''
    if request.params.debug:
        print('Filling missing genotypes from estimated genotype distribution')

    # Load problem fields
    genotype_data = request.problem.genotype.data
    frequency = request.problem.info.snp['frequency'][:, FILLED_GENOTYPES]

    # Recode genotypes to a single number, then locate the (SNP, sample) indices of missing data
    missing = recode.where_missing(recode.recode_single_genotype(genotype_data))
    snp_index = missing[SNP]
    sample_index = missing[SAMPLE]

    # One random multinomial value per missing entry, shifted by 2 to obtain genotype codes
    row_probabilities = scale_row_sums(frequency[snp_index])
    filled_code = multinomial_elementwise(row_probabilities) + 2

    # Vectorized fill: handle all missing entries that drew the same code at once
    for (genotype, code) in recode.GENOTYPE_CODE.iteritems():
        hits = np.where(filled_code == code)[0]
        genotype_data[snp_index[hits], sample_index[hits], :] = genotype
    return False
Пример #10
0
def __handle_fill_missing_genotypes(self, request):
    '''Fill in missing genotypes by sampling from the estimated genotype frequencies of each SNP.

    Works in place on request.problem.genotype.data. Missing entries are found with
    recode.where_missing() on the single-number recoding of the data, and each one receives
    a genotype drawn by multinomial_elementwise() from the FILLED_GENOTYPES frequency columns
    of its SNP after scale_row_sums(). The return value is always False.'''
    debug = request.params.debug
    if debug:
        print('Filling missing genotypes from estimated genotype distribution')

    # Load problem fields
    problem = request.problem
    data = problem.genotype.data
    snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES]

    # Recode genotypes to a single number and find SNP, sample indices of missing data
    recoded = recode.recode_single_genotype(data)
    missing = recode.where_missing(recoded)
    missing_snp, missing_sample = missing[SNP], missing[SAMPLE]

    # Generate random multinomial values; adding 2 maps them to genotype codes
    draws = multinomial_elementwise(scale_row_sums(snp_frequency[missing_snp]))
    filled_code = draws + 2

    # Fill-in all genotypes of a certain value in a vectorized manner
    for (genotype, code) in recode.GENOTYPE_CODE.iteritems():
        selected = np.where(filled_code == code)[0]
        data[missing_snp[selected], missing_sample[selected], :] = genotype
    return False