def test_dishonest_casino_larger_transition_p(self): '''Dishonest Casino Example.''' # Create transition probability matrix A = np.array([[0.9, 0.1], [0.1, 0.9]]) # Create observable probability distribution matrix. Casino biased toward "6" in state "1" B = statutil.scale_row_sums(np.array([[ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], [ 1.0, 1.0, 1.0, 1.0, 1.0, 5.0 ]])) # Create set of all observable symbols V = [1, 2, 3, 4, 5, 6] # Instantiate an HMM, note Pi is uniform probability distribution by default m = hmm.HMM(2, A=A, B=B, V=V) Obs = [ 1, 2, 3, 4, 5, 2, 1, 6, 6, 6, 5, 6 ] log_prob_Obs, Alpha, c = hmm.forward(m, Obs, scaling=1) assert_almost_equal(log_prob_Obs, -20.124, decimal=3, err_msg='Wrong observation probability') Q_star, _, _ = hmm.viterbi(m, Obs, scaling=1) assert_equal(Q_star, [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], err_msg='Wrong Viterbi path') Beta = hmm.backward(m, Obs, c) Gamma, Q_star = hmm.individually_optimal_states(Alpha, Beta) assert_almost_equal(Gamma, [[0.8189770516168013, 0.8482906260695058, 0.8525027084764197, 0.8329611652077556, 0.7834127024175411, 0.6880018120129073, 0.5161970090643716, 0.2130207566284025, 0.12024202874950358, 0.10797060639721641, 0.15902649827833876, 0.14930464162738483], [0.18102294838319855, 0.15170937393049422, 0.14749729152358024, 0.16703883479224435, 0.21658729758245884, 0.31199818798709256, 0.4838029909356284, 0.7869792433715975, 0.8797579712504964, 0.8920293936027837, 0.8409735017216613, 0.8506953583726152]], decimal=5, err_msg='Wrong state probabilities') assert_equal(Q_star, [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'Wrong individually-optimal states')
def test_dishonest_casino(self):
    '''Dishonest Casino Example.'''
    # Very sticky transition matrix: switching states costs probability 0.01.
    trans = np.array([[0.99, 0.01], [0.01, 0.99]])
    # Emission matrix: fair die in state 0, die loaded toward "6" in state 1.
    emit = statutil.scale_row_sums(np.array([[1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
                                             [1.0, 1.0, 1.0, 1.0, 1.0, 5.0]]))
    # Observable symbol alphabet (die faces).
    faces = [1, 2, 3, 4, 5, 6]
    # HMM with the default uniform initial distribution Pi.
    model = hmm.HMM(2, A=trans, B=emit, V=faces)
    rolls = [1, 2, 3, 4, 5, 2, 1, 6, 6, 6, 5, 6]
    # Scaled forward pass; check the observation log-likelihood.
    log_prob, fwd, scale = hmm.forward(model, rolls, scaling=1)
    assert_almost_equal(log_prob, -20.9468006, decimal=5, err_msg='Wrong observation probability')
    # Viterbi decoding: the sticky transitions keep the whole path in one state.
    path, _, _ = hmm.viterbi(model, rolls, scaling=1)
    assert_equal(path, [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'Wrong Viterbi path')
    # Posterior decoding from the forward/backward quantities.
    bwd = hmm.backward(model, rolls, scale)
    gamma, states = hmm.individually_optimal_states(fwd, bwd)
    assert_almost_equal(gamma,
                        [[0.63711364302936, 0.6348934929050587, 0.6271179131667495, 0.6117100305977996, 0.5845543683193845, 0.5383975935172204, 0.46091113744414974, 0.3313982095474306, 0.28864618346708165, 0.27562909135388625, 0.27498372625848855, 0.26932891011973825],
                         [0.36288635697064003, 0.3651065070949412, 0.3728820868332506, 0.38828996940220045, 0.4154456316806155, 0.4616024064827796, 0.5390888625558502, 0.6686017904525694, 0.7113538165329184, 0.7243709086461138, 0.7250162737415115, 0.7306710898802617]],
                        decimal=5, err_msg='Wrong state probabilities')
    assert_equal(states, [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1], 'Wrong individually-optimal states')
def __init__(self, i, T, id_coef_file, e=0.01, debug=False, alpha=1.0):
    '''e=Desired relative error in call rate estimates.
    i=target sample; T=collection of samples to compare i against;
    id_coef_file=path passed through to PhaseParam for identity coefficients.
    NOTE(review): alpha is accepted but not used in this constructor - confirm intended.'''
    # Identity-coefficient parameters for computing Delta between i and each j in T
    params = im.phase.PhaseParam(id_coef_file=id_coef_file)
    Delta = np.array([params.id_coefs(i, j)[1] for j in T])
    # p=row-stochastic matrix. Row j is the probability density of the detailed identity state
    # between i and j
    self.p = np.tile(SUB_PROB, (len(T), 1)) * Delta[:, CONDENSED_STATE]
    # Delta's might only approximately sum to 1, scale p to be a pdf
    self.p = statutil.scale_row_sums(self.p)
    self.debug = debug
    # Estimated # simulations for desired accuracy by the central-limit theorem
    self.num_simulations = int(np.ceil(1. / e**2))
    self.reset()
def __init__(self, i, T, id_coef_file, e=0.01, debug=False, alpha=1.0):
    '''e=Desired relative error in call rate estimates.'''
    phase_params = im.phase.PhaseParam(id_coef_file=id_coef_file)
    # Condensed identity coefficients of sample i against every member of T.
    delta = np.array([phase_params.id_coefs(i, member)[1] for member in T])
    # Build a row-stochastic matrix: row j is the density of the detailed
    # identity state between i and T[j].
    self.p = np.tile(SUB_PROB, (len(T), 1)) * delta[:, CONDENSED_STATE]
    # Coefficients only approximately sum to 1, so renormalize rows into pdfs.
    self.p = statutil.scale_row_sums(self.p)
    self.debug = debug
    # Central-limit-theorem estimate of simulations needed for relative error e.
    self.num_simulations = int(np.ceil(1. / e ** 2))
    self.reset()
def __handle_estimate_genotype_frequencies(self, request):
    """Estimate genotype frequencies from the genotype data and save them in ProblemInfo."""
    problem = request.problem
    snp_info = problem.info.snp
    # Alias into the SNP annotation array; column writes below mutate it in place.
    counts = snp_info["count"]
    # Collapse each genotype pair into a single code for fast comparison.
    coded = recode.recode_single_genotype(problem.genotype.data)
    # Histogram every genotype code per SNP. Column order follows the
    # GENOTYPE_CODE array and covers filled and missing genotypes:
    # (1,1),(1,2),(2,2),(0,0).
    for col, code in enumerate(recode.GENOTYPE_CODE.itervalues()):
        counts[:, col] = statutil.hist(np.where(coded == code)[0], problem.num_snps)
    # Normalize per-SNP counts into frequencies.
    snp_info["frequency"] = statutil.scale_row_sums(counts.astype("float"))
    return False
def test_train_model(self): '''Dishonest Casino Example - EM algorithm.''' # Create transition probability matrix A = np.array([[0.99, 0.01], [0.01, 0.99]]) # Create observable probability distribution matrix. Casino biased toward "6" in state "1". B = statutil.scale_row_sums(np.array([[ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ], [ 1.0, 1.0, 1.0, 1.0, 1.0, 5.0 ]])) # Create set of all observable symbols V = [1, 2, 3, 4, 5, 6] # Instantiate an HMM, note Pi is uniform probability distribution by default m = hmm.HMM(2, A=A, B=B, V=V) Obs = [ 1, 2, 3, 4, 5, 2, 1, 6, 6, 6, 5, 6 ] c = [Obs] hmm.baum_welch(m, c, epochs=15, graph=False) TestHmm.assert_model_matrices_almost_equal(m, ([[0.856658708052639, 0.14334129194736125], [2.454940916925095e-16, 1.0]], [[0.28329354031233306, 0.2866825838637413, 0.14334129194736112, 0.14334129194736112, 0.14334129192821368, 9.896623857864685e-13], [0.004706380704415612, 4.3023359620169447e-11, 3.2510873580469717e-111, 1.2201233032249015e-54, 0.19905872387205914, 0.7962348953805019]], [1.0, 4.364785210913299e-122]))
def __handle_estimate_genotype_frequencies(self, request): '''Estimate genotype frequencies from the genotype data and save them in ProblemInfo.''' # Load problem fields problem = request.problem snp_metadata = problem.info.snp snp_count = snp_metadata['count'] # Recode genotypes to a single number r = recode.recode_single_genotype(problem.genotype.data) # Count genotype appearances for each SNP, and save in SNP annotation array. # The frequency table column order matches the GENOTYPE_CODE array. This includes filled # and missing genotypes: (1,1),(1,2),(2,2),(0,0). for col, genotype_code in enumerate(recode.GENOTYPE_CODE.itervalues()): snp_count[:, col] = statutil.hist( np.where(r == genotype_code)[0], problem.num_snps) # Calculate frequencies snp_metadata['frequency'] = statutil.scale_row_sums( snp_count.astype('float')) return False
def __handle_fill_missing_genotypes(self, request): '''Fill missing genotype entries by randomly sampling from the multinomial distribution with estimated genotype frequencies at the corresponding SNP.''' # Load problem fields if request.params.debug: print 'Filling missing genotypes from estimated genotype distribution' problem = request.problem g = problem.genotype.data snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES] # Recode genotypes to a single number r = recode.recode_single_genotype(g) # Find SNP, sample indices of missing data missing = recode.where_missing(r) # Generate random multinomial values; map them to genotype codes filled_code = multinomial_elementwise(scale_row_sums(snp_frequency[missing[SNP]])) + 2 # Fill-in all genotypes of a certain value in a vectorized manner for (genotype, code) in recode.GENOTYPE_CODE.iteritems(): index = np.where(filled_code == code)[0] g[missing[SNP][index], missing[SAMPLE][index], :] = genotype return False
def __handle_fill_missing_genotypes(self, request): '''Fill missing genotype entries by randomly sampling from the multinomial distribution with estimated genotype frequencies at the corresponding SNP.''' # Load problem fields if request.params.debug: print 'Filling missing genotypes from estimated genotype distribution' problem = request.problem g = problem.genotype.data snp_frequency = problem.info.snp['frequency'][:, FILLED_GENOTYPES] # Recode genotypes to a single number r = recode.recode_single_genotype(g) # Find SNP, sample indices of missing data missing = recode.where_missing(r) # Generate random multinomial values; map them to genotype codes filled_code = multinomial_elementwise( scale_row_sums(snp_frequency[missing[SNP]])) + 2 # Fill-in all genotypes of a certain value in a vectorized manner for (genotype, code) in recode.GENOTYPE_CODE.iteritems(): index = np.where(filled_code == code)[0] g[missing[SNP][index], missing[SAMPLE][index], :] = genotype return False
def B(t):
    # Create observable probability distribution matrix. Casino biased toward "6" in state "1".
    # (Comment restored from a botched search/replace that mangled "Create"/"Casino".)
    # In the loaded state, mass shifts from "1" to "6" as t grows; D is presumably a
    # module-level offset keeping the denominator positive at t=0 - confirm.
    return statutil.scale_row_sums(np.array([[ 1.0, 1.0, 1.0, 1.0, 1.0, 1.0 ],
                                             [ 1.0 - 1.0 / (t + D), 1.0, 1.0, 1.0, 1.0, 5.0 + 1.0 / (t + D) ]]))