def model_dawidskene(mcmc_in, alpha_prior, asymmetric_accuracy=True,
                     hashtag_treatment="strong", draws=500, tune=500):
    '''
    alpha_prior: (K, J, 2) matrix of pseudocounts for the Dirichlet accuracy prior.
    If z_obs is present (not masked), those labels are treated as oracle values.
    '''
    model = pm.Model()
    with model:
        if hashtag_treatment == "strong":
            rho_prior = np.ones((2, 2))
            rho_prior[1, 1] = 49
            rho = pm.Dirichlet('rho', a=rho_prior, shape=(mcmc_in.K, 2, 2))
            z = pm.Categorical('z', p=rho[mcmc_in.kk_lkup, mcmc_in.flag_hashtag],
                               observed=np.ma.masked_values(mcmc_in.z_obs, value=-999),
                               testval=mcmc_in.z_init, shape=mcmc_in.N)
        elif hashtag_treatment == "weak":
            rho_prior = np.ones((2, 2))
            rho = pm.Dirichlet('rho', a=rho_prior, shape=(mcmc_in.K, 2, 2))
            z = pm.Categorical('z', p=rho[mcmc_in.kk_lkup, mcmc_in.flag_hashtag],
                               observed=np.ma.masked_values(mcmc_in.z_obs, value=-999),
                               testval=mcmc_in.z_init, shape=mcmc_in.N)
        elif hashtag_treatment in ("oracle", "none"):
            rho_prior = np.ones((1, 2))
            rho = pm.Dirichlet('rho', a=rho_prior, shape=(mcmc_in.K, 2))
            z = pm.Categorical('z', p=rho[mcmc_in.kk_lkup],
                               observed=np.ma.masked_values(mcmc_in.z_obs, value=-999),
                               testval=mcmc_in.z_init, shape=mcmc_in.N)

        if asymmetric_accuracy:
            alpha = pm.Dirichlet("alpha", a=alpha_prior,
                                 shape=(2, mcmc_in.K, mcmc_in.J, 2))

            def logp(r, z=z, alpha=alpha):
                # Annotator is correct with probability alpha[..., 1], which
                # here depends on the true class z (asymmetric accuracy).
                out = T.switch(T.eq(z[mcmc_in.ii], r),
                               T.log(alpha[z[mcmc_in.ii], mcmc_in.kk, mcmc_in.jj, 1]),
                               T.log(1 - alpha[z[mcmc_in.ii], mcmc_in.kk, mcmc_in.jj, 1]))
                return T.sum(out)
        else:
            alpha = pm.Dirichlet("alpha", a=alpha_prior,
                                 shape=(mcmc_in.K, mcmc_in.J, 2))

            def logp(r, z=z, alpha=alpha):
                out = T.switch(T.eq(z[mcmc_in.ii], r),
                               T.log(alpha[mcmc_in.kk, mcmc_in.jj, 1]),
                               T.log(1 - alpha[mcmc_in.kk, mcmc_in.jj, 1]))
                return T.sum(out)

        r = pm.DensityDist('r', logp, observed=mcmc_in.r_obs,
                           shape=len(mcmc_in.r_obs))

    with model:
        # NUTS for the continuous variables, Gibbs for the discrete labels.
        step1 = pm.NUTS(vars=[rho, alpha])
        step2 = pm.CategoricalGibbsMetropolis(vars=[z.missing_values])
        trace = pm.sample(draws=draws, tune=tune, step=[step1, step2], chains=1)

    return trace
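# Hedged usage sketch for model_dawidskene (not from the original source).
# `mcmc_in` is assumed to be a simple namespace bundling the design arrays;
# the attribute names and shapes below are inferred from the function body,
# and the module is assumed to import pymc3 as pm, numpy as np, and
# theano.tensor as T.
from types import SimpleNamespace
import numpy as np

mcmc_in = SimpleNamespace(
    N=100,                               # number of items
    K=3,                                 # number of item groups
    J=5,                                 # number of annotators
    ii=np.repeat(np.arange(100), 5),     # item index per rating
    jj=np.tile(np.arange(5), 100),       # annotator index per rating
    kk=np.zeros(500, dtype=int),         # group index per rating
    kk_lkup=np.zeros(100, dtype=int),    # group index per item
    flag_hashtag=np.zeros(100, dtype=int),
    z_obs=np.full(100, -999),            # -999 marks unlabelled items
    z_init=np.zeros(100, dtype=int),
    r_obs=np.random.randint(0, 2, 500),
)
alpha_prior = np.ones((3, 5, 2))
trace = model_dawidskene(mcmc_in, alpha_prior, hashtag_treatment="oracle")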
def _set_steps(self, model, z, *params):
    with model:
        self._continuous_step = self.sampler(params)
        if z is not None:
            if hasattr(z.distribution, "name") and \
                    z.distribution.name in [BinaryMRF.NAME, CategoricalMRF.NAME]:
                self._discrete_step = RandomFieldGibbs([z])
            else:
                self._discrete_step = pm.CategoricalGibbsMetropolis([z])
            self._steps = [self._continuous_step, self._discrete_step]
        else:
            self._steps = [self._continuous_step]
    self._model = model
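# Minimal self-contained sketch of the compound-step pattern that _set_steps
# assembles (an assumed toy example, not from the original source): NUTS
# handles the continuous parameters, CategoricalGibbsMetropolis handles the
# discrete latent variable, and pm.sample receives both steps together.
import numpy as np
import pymc3 as pm

with pm.Model() as toy_model:
    w = pm.Dirichlet('w', a=np.ones(3))            # mixture weights (continuous)
    z = pm.Categorical('z', p=w)                   # discrete latent component
    mu = pm.Normal('mu', mu=0., sd=1., shape=3)    # component means
    obs = pm.Normal('obs', mu=mu[z], sd=1., observed=np.random.randn(20))
    steps = [pm.NUTS([w, mu]), pm.CategoricalGibbsMetropolis([z])]
    trace = pm.sample(500, tune=500, step=steps, chains=1)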
def do_inference(self, draws=20000, tune=2000, init='adapt_diag', **kwargs):
    if self.model is None:
        self.build_model()

    # It's important that we now check the model specification, namely:
    # are there any problems with logp being undefined?
    with self.model as model:
        test_point = model.check_test_point()

    if len(self.model.name) > 0:
        l_key = self.model.name + '_'
    else:
        l_key = ''

    print(test_point)
    if np.isnan(test_point['{}Likelihood'.format(l_key)]):
        print("The model's test point had an undefined likelihood, "
              "meaning sampling will fail")
        sys.exit(0)

    # Sampling
    with self.model as model:
        if self.n_choice is not None:
            step1 = pm.CategoricalGibbsMetropolis(self.n_choice)
            step2 = pm.Metropolis([self.a, self.psi, self.sigma_r])
            trace = pm.sample(draws=draws, tune=tune, init=init,
                              step=[step1, step2], **kwargs)
        else:
            trace = pm.sample(draws=draws, tune=tune, init=init, **kwargs)

    return trace
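# Hedged sketch (an assumed toy model, not from the source): check_test_point()
# returns the log-probability of each model term at the test point; a NaN or
# -inf entry pinpoints which term would break sampling, which is exactly what
# do_inference guards against above.
import pymc3 as pm

with pm.Model() as m:
    x = pm.Normal('x', mu=0., sd=1.)
    y = pm.Normal('Likelihood', mu=x, sd=1., observed=[0.1, -0.3])
    print(m.check_test_point())  # one finite logp entry per variable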
def lohhla_clone_model(sample_ids, tree_edges, clonal_prevalence_mat,
                       cellularity, ploidy_values, tumour_sample_reads,
                       normal_sample_reads, integercpn_info, all_genotypes,
                       transition_inputs, stayrate_alpha=0.9, stayrate_beta=0.1,
                       sd=0.5, nb_alpha=0.5, iter_count=20000, tune_iters=20000,
                       anchor_type='nb', anchor_mode='snvcn', nchains=2, njobs=2):
    '''
    stayrate_alpha: Beta prior alpha-parameter on the stay rate in the clone tree Markov chain
    stayrate_beta: Beta prior beta-parameter on the stay rate in the clone tree Markov chain
    all_genotypes: DataFrame of genotypes, 0-indexed
    '''
    num_nodes = clonal_prevalence_mat.shape[1]
    valid_transitions = transition_inputs['valid_transitions']
    num_transitions = transition_inputs['num_transitions']
    num_genotypes = transition_inputs['num_genotypes']
    cn_genotype_matrix = transition_inputs['cn_genotype_matrix']

    ## Beta-binomial dispersion (higher = less dispersed)
    dispersion = 200.

    ## Tree edges, converted to 0-based indexing
    ## (.values replaces the deprecated DataFrame.as_matrix())
    edges = tree_edges.values.astype(int) - 1

    with pm.Model() as model:
        BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.)
        stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4)

        ## Transition matrix of the clone tree Markov chain
        P = np.zeros(shape=(num_genotypes, num_genotypes))
        P = P + tt.eye(num_genotypes) * stay_rate
        fill_values = tt.as_tensor((1. - stay_rate) / num_transitions)
        fill_values = tt.set_subtensor(fill_values[0], 0)
        P = P + valid_transitions * fill_values[:, np.newaxis]
        P = tt.set_subtensor(P[0, 0], 1.)

        A = tt.dmatrix('A')
        PA = tt.ones(shape=(num_genotypes)) / num_genotypes

        states = CloneTreeGenotypes('genotypes', PA=PA, P=P, edges=edges,
                                    k=num_genotypes, shape=(num_nodes))

        total_cns = theano.shared(np.array(all_genotypes['total_cn'].values))
        alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values))

        total_cn = pm.Deterministic('total_cn', total_cns[states])
        alt_cn = pm.Deterministic('alt_cn', alt_cns[states])

        sample_alt_copies = (tt.dot(clonal_prevalence_mat, alt_cn) * cellularity
                             + (1. - cellularity) * 1.)
        vafs = sample_alt_copies / (tt.dot(clonal_prevalence_mat, total_cn) * cellularity
                                    + (1. - cellularity) * 2.)
        pm.Deterministic('vafs', vafs)

        alphas = vafs * dispersion
        betas = (1 - vafs) * dispersion

        ## Copy number of tumour cells (aggregated over clones, but not
        ## including normal contamination)
        tutotalcn = pm.Deterministic('tutotalcn',
                                     tt.dot(clonal_prevalence_mat, total_cn))

        ## Can't be vectorized further
        for j in range(len(sample_ids)):
            current_sample = sample_ids[j]
            total_counts = (integercpn_info['TumorCov_type1'][current_sample].values
                            + integercpn_info['TumorCov_type2'][current_sample].values)
            alt_counts = integercpn_info['TumorCov_type2'][current_sample].values
            alpha_sel = alphas[j]
            beta_sel = betas[j]

            ## Draw alternative allele counts for the HLA locus at each polymorphic site
            alt_reads = pm.BetaBinomial('x_' + str(j), alpha=alpha_sel, beta=beta_sel,
                                        n=total_counts, observed=alt_counts)

            mult_factor_mean = tumour_sample_reads[current_sample] / normal_sample_reads
            ploidy = ploidy_values[j]
            ploidy_ratio = ((tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2)
                            / (cellularity[j] * ploidy + (1 - cellularity[j]) * 2))

            if anchor_mode == 'snvcn':
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    1. / ploidy_ratio
                    * (integercpn_info['Total_TumorCov'][current_sample].values
                       / integercpn_info['Total_NormalCov'][current_sample].values))
                nloci = len(integercpn_info['Total_TumorCov'][current_sample].values)
                tumour_reads_observed = integercpn_info['Total_TumorCov'][current_sample].values
                normal_reads_observed = integercpn_info['Total_NormalCov'][current_sample].values
            elif anchor_mode == 'binmedian':
                binvar_tumour = 'combinedBinTumor'
                binvar_normal = 'combinedBinNormal'
                ## All entries within a bin are the same, so this is OK
                duplicated_entries = integercpn_info['binNum'][current_sample].duplicated(keep='first')
                nloci = len(integercpn_info[binvar_tumour][current_sample][~duplicated_entries].values)
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    1. / ploidy_ratio
                    * (integercpn_info[binvar_tumour][current_sample][~duplicated_entries].values
                       / integercpn_info[binvar_normal][current_sample][~duplicated_entries].values))
                tumour_reads_observed = integercpn_info[binvar_tumour][current_sample][~duplicated_entries].values
                normal_reads_observed = integercpn_info[binvar_normal][current_sample][~duplicated_entries].values
            else:
                raise Exception("Invalid option specified.")

            ## Draw the ploidy-corrected tumour/normal locus coverage ratio
            ## at each polymorphic site
            if anchor_type == 'mult_factor':
                mult_factor = pm.Lognormal('mult_factor_' + str(j),
                                           mu=np.log(mult_factor_mean), sd=sd,
                                           observed=mult_factor_computed,
                                           shape=(nloci))
            elif anchor_type == 'nb':
                tc_nc_ratio = pm.Deterministic(
                    'tc_nc_ratio_' + str(j),
                    (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2)
                    / (ploidy * cellularity[j] + (1 - cellularity[j]) * 2))
                tumoursamplecn = pm.Deterministic(
                    'tumoursamplecn_' + str(j),
                    tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2)
                tumour_reads_mean = pm.Deterministic(
                    'tumour_reads_mean_' + str(j),
                    tc_nc_ratio * mult_factor_mean * normal_reads_observed)
                tumour_reads = pm.NegativeBinomial('tumour_reads_' + str(j),
                                                   mu=tumour_reads_mean,
                                                   alpha=nb_alpha,
                                                   observed=tumour_reads_observed)
            else:
                raise Exception('Must specify a valid model type.')

        pm.Deterministic('log_prob', model.logpt)

        ## Gibbs for the discrete genotype states, Metropolis for the stay rate
        step1 = pm.CategoricalGibbsMetropolis(vars=[states])
        step2 = pm.Metropolis(vars=[stay_rate])
        trace = pm.sample(iter_count, tune=tune_iters, step=[step1, step2],
                          njobs=njobs, chains=nchains)

    return trace
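# Hedged follow-up sketch (an assumption, not from the source): the discrete
# 'genotypes' draws returned by CategoricalGibbsMetropolis are integer
# genotype indices, so a posterior mode per tree node can be read off the
# trace returned by lohhla_clone_model above.
import numpy as np

genotype_draws = trace['genotypes']  # shape: (n_draws, num_nodes)
posterior_mode = np.array([np.bincount(genotype_draws[:, node]).argmax()
                           for node in range(genotype_draws.shape[1])])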
G = pm.Normal('G', mu=np.zeros(num_states - 1),
              sd=np.ones(num_states - 1) * 10000.,
              shape=(num_states - 1))
states = CommitmentProcess('states', PI=PI, Q=Q, renewal_mask=renewal_mask,
                           num_states=num_states, shape=(num_custs, obs_len),
                           testval=states_test_val)
usage = UsageProcess('usage', alpha=A, th0=th0, G=G, states=states,
                     num_states=num_states, shape=(num_custs),
                     observed=observed_usage)

start = pm.find_MAP(method='Powell')
step1 = pm.Metropolis(vars=[r, PI, Q, A, G, th0, usage])
step2 = pm.CategoricalGibbsMetropolis(vars=[states])
trace = pm.sample(draws, start=start, step=[step1, step2], chains=chains)

print('saving to ' + args.output_dir)
pm.backends.ndarray.save_trace(trace, directory=args.output_dir, overwrite=True)
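# Hedged sketch (an assumption, not from the source): a trace written with
# pm.backends.ndarray.save_trace can be restored later with pm.load_trace,
# provided the same model is in scope when loading.
with model:
    restored = pm.load_trace(args.output_dir)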
def contaminate_mixture(data, fit_for='z', fit_data=None):
    # stick-breaking problems
    steps = []

    # shapes and sizes
    n_epochs = data['epoch_i'].max() + 1  # each epoch indexed by epoch_i
    n_raters = data['rater_i'].max() + 1
    n_obs = data.shape[0]  # each spindle marker indexed by t

    # static prior vars
    trust_purcell = 0.1  # crank up to give more weight to Purcell et al., 2017
    purcell = np.array([0.3587, 0.6387, 0.0026, 0., 0., 0.]) + (1 - trust_purcell)
    s_number_prior = purcell / purcell.sum()
    max_s = len(s_number_prior) - 1
    gss_spindle_testvals = [1., 5., 10., 15., 20.]

    with pm.Model() as model:
        # True s
        gss = pm.Uniform('gss', lower=0., upper=25., shape=(n_epochs, max_s),
                         testval=np.tile(np.array(gss_spindle_testvals).T,
                                         reps=(n_epochs, 1)))

        # Real spindles
        gss_per_obs = gss[data['epoch_i'], :]

        # The number of spindles per epoch:
        if fit_for == 'z':
            gss_prior = pm.Dirichlet('gss_prior', a=s_number_prior)
            if n_epochs > 1:
                z = pm.Categorical('z', p=gss_prior, shape=n_epochs)
            else:
                z = pm.Categorical('z', p=gss_prior)
        else:
            z = fit_data['z']
        z_rs = z.reshape((n_epochs, 1))

        if fit_for in ['w', 'z']:  # when we are finding z or w
            w_prior_possibilities = tt.tril(tt.ones((max_s + 1, max_s + 1)))
            w = pm.Categorical('w',
                               p=w_prior_possibilities[z_rs[data['epoch_i'], 0], :],
                               shape=n_obs)
        else:  # fit for gss
            w = fit_data['w']

        # --- Raters' ability to detect markers --- #
        r_E = pm.Bound(pm.Normal, lower=0.)('r_E', mu=0.5, sd=0.5, shape=n_raters)
        r_E_per_obs = r_E[data['rater_i']]
        # r_E = pm.Bound(pm.Normal, lower=0.)('r_E', mu=0.5, sd=0.5)

        # --- Behaviour --- #
        contaminate_dist_s = pm.Uniform.dist(lower=0., upper=25., shape=n_obs)
        contaminate_dist_s.mean = 12.5
        possible_dists = [contaminate_dist_s]
        for i in range(0, 5):
            dist = pm.Normal.dist(mu=gss_per_obs[:, i], sd=r_E_per_obs)
            dist.mean = gss_spindle_testvals[i]
            possible_dists.append(dist)
        w_array = tt.extra_ops.to_one_hot(w, nb_class=max_s + 1)
        s = pm.Mixture('s', w=w_array, comp_dists=possible_dists,
                       observed=data['s'])

        # Step methods for vars:
        if fit_for == 'z':
            steps = [pm.CategoricalGibbsMetropolis([z, w]),
                     pm.NUTS([gss_prior, gss, r_E], target_accept=0.9)]
        if fit_for == 'w':
            steps = [pm.CategoricalGibbsMetropolis([w]),
                     pm.NUTS([gss, r_E], target_accept=0.9)]
        # else, everything is left to NUTS

    return model, steps
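# Hedged usage sketch (an assumption, not from the source): the function
# returns the model together with its step methods, which are handed to
# pm.sample; `data` is assumed to be the DataFrame described above with
# 'epoch_i', 'rater_i', and 's' columns.
model, steps = contaminate_mixture(data, fit_for='z')
with model:
    trace = pm.sample(draws=2000, tune=1000, step=steps)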
with pm.Model() as model:
    # cluster sizes
    p = pm.Dirichlet('p', a=np.array([1., 1., 1.]), shape=k)
    # ensure all clusters have some points
    p_min_potential = pm.Potential('p_min_potential',
                                   tt.switch(tt.min(p) < .1, -np.inf, 0))

    # cluster centers
    means = pm.Normal('means', mu=[0, 0, 0], sigma=15, shape=k)
    # break symmetry
    order_means_potential = pm.Potential(
        'order_means_potential',
        tt.switch(means[1] - means[0] < 0, -np.inf, 0)
        + tt.switch(means[2] - means[1] < 0, -np.inf, 0))

    # measurement error
    sd = pm.Uniform('sd', lower=0, upper=20)

    # latent cluster of each observation
    category = pm.Categorical('category', p=p, shape=ndata)

    # likelihood for each observed value
    points = pm.Normal('obs', mu=means[category], sigma=sd, observed=data)

""" Fit Model """
with model:
    step1 = pm.Metropolis(vars=[p, sd, means])
    step2 = pm.CategoricalGibbsMetropolis(vars=[category])
    tr = pm.sample(10000, step=[step1, step2], tune=5000)
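# Hedged follow-up sketch (an assumption, not from the source): inspect the
# continuous parameters and recover hard cluster assignments from the
# posterior mode of the discrete 'category' draws.
pm.traceplot(tr)
assignments = np.array([np.bincount(tr['category'][:, i]).argmax()
                        for i in range(ndata)])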