def test_multinomial_elementwise_distribution(self):
    '''Verify that the created variables approach a multinomial distribution for large numbers
    of samples.

    For each sample size r = 2^4, ..., 2^16 the RMS error between the observed category
    frequencies and the desired probabilities p is measured. A power law C*r^alpha is then
    fitted to the errors by linear regression in log-log space, and alpha, C and the fit
    quality are checked.'''
    # m rows of probabilities, n categories per row, k repeated experiments per sample size
    (m, n, k) = (6, 5, 1)
    r = 2 ** np.arange(4, 17)
    p = statutil.random_row_stochastic((m, n))
    error = np.zeros((len(r),))
    for (i, r_val) in enumerate(r):
        for _ in range(k):
            x = statutil.multinomial_elementwise(p, r_val)
            # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
            error[i] += statutil.norm_frobenius_scaled(statutil.hist(x, n) / (1.0 * r_val) - p)
        # Average the error over the k experiments
        error[i] /= (1.0 * k)

    # Validate the error model C*r^(-0.5), which is a consequence of the Central Limit Theorem.
    # We are making k experiments for each value of r. Even if k=1, there's a 95% chance that
    # we are within ~1.96 standard deviations from the mean of the normal distribution
    # sqrt(r)*[observed freq variable - p[i,j]] for each entry j of a row i of the matrix p.
    # So if row i's stddev is s[i], the sum of squared errors should be (with 95% confidence)
    # <= n * (1.96*s[i])^2. So
    # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
    # See http://en.wikipedia.org/wiki/Central_limit_theorem
    # NOTE(review): the bound computed below takes the 2-norm of the per-row *variances* of the
    # category index (i.e. of s[i]^2), not of s[i] as in the derivation above -- confirm which
    # one was intended.
    alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
    # The regression intercept is log(C)
    c = np.exp(c)
    assert_almost_equal(alpha, -0.5, decimal=1, err_msg='Unexpected error term growth power')

    # Per-row variance of the category index: E[j^2] - E[j]^2 under the probabilities p[i,:]
    category = np.arange(p.shape[1])
    row_variance = np.sum(p * category ** 2, axis=1) - np.sum(p * category, axis=1) ** 2
    # Use the public np.linalg.norm; np.linalg.linalg is a private module path
    bound = 1.96 * np.linalg.norm(row_variance, 2) / np.sqrt(p.shape[0])
    self.assertTrue(c <= bound, 'Error term coefficient outside 95% confidence interval')
    self.assertTrue(abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
def test_multinomial_elementwise_vector(self):
    '''Test creating multinomial variables (r=1): one draw per row of a random
    row-stochastic probability matrix.'''
    # NOTE(review): the expected draws are fixed numbers, so the random generator is presumably
    # seeded outside this method -- confirm against the test fixture.
    expected = [3, 1, 3, 4, 0, 2, 2, 3, 1, 4, 2, 3, 2, 1, 4, 2, 4, 3, 4, 3]
    shape = (20, 5)
    probabilities = statutil.random_row_stochastic(shape)
    sample = statutil.multinomial_elementwise(probabilities)
    assert_equal(sample, expected, 'Wrong random multinomial generated')
def test_multinomial_elementwise_vector(self):
    '''Test creating multinomial variables (r=1).

    A random row-stochastic matrix with 20 rows and 5 columns is generated, a single
    multinomial value is drawn for each row, and the draws are compared with fixed values.'''
    num_rows, num_categories = 20, 5
    row_probabilities = statutil.random_row_stochastic((num_rows, num_categories))
    drawn = statutil.multinomial_elementwise(row_probabilities)
    # NOTE(review): fixed expected values presumably rely on a seed set elsewhere -- confirm.
    desired = [
        3, 1, 3, 4, 0, 2, 2, 3, 1, 4,
        2, 3, 2, 1, 4, 2, 4, 3, 4, 3,
    ]
    assert_equal(drawn, desired, 'Wrong random multinomial generated')
def test_multinomial_elementwise_matrix(self):
    '''Test creating multinomial variables (r > 1): two draws per row of a random
    row-stochastic probability matrix.'''
    # NOTE(review): the expected draws are fixed numbers, so the random generator is presumably
    # seeded outside this method -- confirm against the test fixture.
    expected = [
        [3, 3], [1, 2], [3, 3], [4, 3], [0, 1],
        [2, 2], [2, 0], [3, 1], [1, 2], [4, 0],
        [2, 2], [3, 1], [2, 1], [1, 1], [4, 2],
        [2, 3], [4, 3], [3, 4], [4, 3], [3, 1],
    ]
    shape = (20, 5)
    draws_per_row = 2
    probabilities = statutil.random_row_stochastic(shape)
    sample = statutil.multinomial_elementwise(probabilities, draws_per_row)
    assert_equal(sample, expected, 'Wrong random multinomial generated')
def _simulate_imputation(self):
    '''Simulate imputing i from phased T-haplotypes.

    Draws one random multinomial sample from the probabilities self.p, maps it through the
    IMPUTABLE lookup table, and updates the counters: self.count_full when both alleles are
    imputable, self.count_part when at least one is, and self.count on every call.'''
    # NOTE(review): assumes IMPUTABLE holds non-negative integer codes whose two lowest bits
    # flag the imputable alleles -- confirm against its definition.
    x = IMPUTABLE[statutil.multinomial_elementwise(self.p)]
    # Explicit floor division: with true division (Python 3, or 'from __future__ import
    # division') a code of 1 would give 0.5, which is truthy, and wrongly flag allele 0.
    a0 = np.any(x // 2)  # High bit (value 2) set for at least one element ==> allele 0 is imputable
    a1 = np.any(x % 2)  # Low bit (value 1) set for at least one element ==> allele 1 is imputable
    if a0 and a1:
        # Both alleles are imputable
        self.count_full += 1
    if a0 or a1:
        # Either allele is imputable
        self.count_part += 1
    self.count += 1
def _simulate_imputation(self):
    '''Simulate imputing i from phased T-haplotypes.

    One multinomial sample is drawn from self.p and translated through IMPUTABLE. The result
    decides which counters are incremented: self.count_full (both alleles imputable),
    self.count_part (at least one allele imputable) and self.count (always).'''
    # NOTE(review): assumes IMPUTABLE entries are non-negative integer bit masks -- confirm.
    x = IMPUTABLE[statutil.multinomial_elementwise(self.p)]
    # Use // rather than /: true division turns a code of 1 into 0.5, which np.any treats as
    # set, so the allele-0 flag would be wrong on Python 3 or under __future__ division.
    a0 = np.any(x // 2)  # Value-2 bit set for at least one element of x ==> allele 0 is imputable
    a1 = np.any(x % 2)  # Value-1 bit set for at least one element of x ==> allele 1 is imputable
    if a0 and a1:
        # Both alleles are imputable
        self.count_full += 1
    if a0 or a1:
        # Either allele is imputable
        self.count_part += 1
    self.count += 1
def test_multinomial_elementwise_distribution(self):
    '''Verify that the created variables approach a multinomial distribution for large numbers
    of samples.

    The RMS error between observed category frequencies and the desired probabilities p is
    measured for sample sizes r = 2^4, ..., 2^16. A power law C*r^alpha is fitted to the errors
    by linear regression in log-log space; alpha, C and the correlation are then checked.'''
    # m rows of probabilities, n categories per row, k repeated experiments per sample size
    (m, n, k) = (6, 5, 1)
    r = 2**np.arange(4, 17)
    p = statutil.random_row_stochastic((m, n))
    error = np.zeros((len(r), ))
    for (i, r_val) in enumerate(r):
        for _ in range(k):
            x = statutil.multinomial_elementwise(p, r_val)
            # Root-mean-square-error of observed frequencies w.r.t. desired frequencies
            error[i] += statutil.norm_frobenius_scaled(
                statutil.hist(x, n) / (1.0 * r_val) - p)
        # Average the error over the k experiments
        error[i] /= (1.0 * k)

    # Validate the error model C*r^(-0.5), which is a consequence of the Central Limit Theorem.
    # We are making k experiments for each value of r. Even if k=1, there's a 95% chance that
    # we are within ~1.96 standard deviations from the mean of the normal distribution
    # sqrt(r)*[observed freq variable - p[i,j]] for each entry j of a row i of the matrix p.
    # So if row i's stddev is s[i], the sum of squared errors should be (with 95% confidence)
    # <= n * (1.96*s[i])^2. So
    # C <= sqrt(sum(n * (1.96*s[i])^2)_i / (m*n)) = 1.96 * sqrt(sum(s[i]^2)_i / m).
    # See http://en.wikipedia.org/wiki/Central_limit_theorem
    # NOTE(review): the bound computed below takes the 2-norm of the per-row *variances* of the
    # category index (i.e. of s[i]^2), not of s[i] as in the derivation above -- confirm which
    # one was intended.
    alpha, c, r_value, _, _ = linregress(np.log(r), np.log(error))
    # The regression intercept is log(C)
    c = np.exp(c)
    assert_almost_equal(alpha,
                        -0.5,
                        decimal=1,
                        err_msg='Unexpected error term growth power')

    # Per-row variance of the category index: E[j^2] - E[j]^2 under the probabilities p[i,:]
    category = np.arange(p.shape[1])
    row_variance = (np.sum(p * category**2, axis=1) -
                    np.sum(p * category, axis=1)**2)
    # Use the public np.linalg.norm; np.linalg.linalg is a private module path
    bound = 1.96 * np.linalg.norm(row_variance, 2) / np.sqrt(p.shape[0])
    self.assertTrue(
        c <= bound, 'Error term coefficient outside 95% confidence interval')
    self.assertTrue(
        abs(r_value) > 0.99, 'Error does not fit a power law in sample size')
def __handle_fill_missing_genotypes(self, request):
    '''Fill missing genotype entries by randomly sampling from the multinomial distribution
    with estimated genotype frequencies at the corresponding SNP.

    The genotype array request.problem.genotype.data is modified in place. Always returns
    False.'''
    # Load problem fields
    if request.params.debug:
        # Single-argument call form prints identically under Python 2 and Python 3
        print('Filling missing genotypes from estimated genotype distribution')
    problem = request.problem
    g = problem.genotype.data
    snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(g)
    # Find SNP, sample indices of missing data
    missing = recode.where_missing(r)
    missing_snp = missing[SNP]
    missing_sample = missing[SAMPLE]

    # Generate random multinomial values; map them to genotype codes
    # NOTE(review): the +2 offset presumably shifts the sampled category index onto the codes
    # in recode.GENOTYPE_CODE that denote filled genotypes -- confirm against recode.
    filled_code = multinomial_elementwise(scale_row_sums(snp_frequency[missing_snp])) + 2

    # Fill-in all genotypes of a certain value in a vectorized manner
    for (genotype, code) in recode.GENOTYPE_CODE.items():
        index = np.where(filled_code == code)[0]
        g[missing_snp[index], missing_sample[index], :] = genotype
    return False
def __handle_fill_missing_genotypes(self, request):
    '''Fill missing genotype entries by randomly sampling from the multinomial distribution
    with estimated genotype frequencies at the corresponding SNP.

    Operates in place on request.problem.genotype.data and always returns False.'''
    # Load problem fields
    if request.params.debug:
        # print(...) with one argument behaves the same on Python 2 and Python 3
        print('Filling missing genotypes from estimated genotype distribution')
    problem = request.problem
    g = problem.genotype.data
    snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES]

    # Recode genotypes to a single number
    r = recode.recode_single_genotype(g)
    # Find SNP, sample indices of missing data
    missing = recode.where_missing(r)
    snp_index, sample_index = missing[SNP], missing[SAMPLE]

    # Generate random multinomial values; map them to genotype codes
    # NOTE(review): the +2 offset presumably aligns the sampled category index with the
    # filled-genotype codes in recode.GENOTYPE_CODE -- confirm against recode.
    filled_code = multinomial_elementwise(
        scale_row_sums(snp_frequency[snp_index])) + 2

    # Fill-in all genotypes of a certain value in a vectorized manner
    for (genotype, code) in recode.GENOTYPE_CODE.items():
        index = np.where(filled_code == code)[0]
        g[snp_index[index], sample_index[index], :] = genotype
    return False