示例#1
0
def model_dawidskene(mcmc_in, alpha_prior, asymmetric_accuracy=True, hashtag_treatment="strong", draws=500, tune=500):
    '''
    Build a Dawid-Skene style rating model with PyMC3 and sample from it.

    A latent two-valued label ``z`` (``mcmc_in.N`` entries) is drawn from
    per-``K`` probabilities ``rho``.  Every observed rating in
    ``mcmc_in.r_obs`` contributes ``log(alpha[..., 1])`` when it equals the
    label of the item it refers to (``z[mcmc_in.ii]``) and
    ``log(1 - alpha[..., 1])`` otherwise.

    Parameters
    ----------
    mcmc_in : object
        Container exposing ``K``, ``J``, ``N``, ``kk_lkup``, ``z_obs``,
        ``z_init``, ``ii``, ``kk``, ``jj`` and ``r_obs`` (plus
        ``flag_hashtag`` for the "strong"/"weak" treatments).  Entries of
        ``z_obs`` equal to -999 are masked, i.e. treated as unobserved.
    alpha_prior : array-like
        Dirichlet pseudocounts for ``alpha``.  The original note gives the
        shape as (K, J, 2); the asymmetric model declares ``alpha`` with shape
        (2, K, J, 2), so the prior presumably has to be compatible with that
        -- TODO confirm against callers.
    asymmetric_accuracy : bool
        If True, ``alpha`` carries an extra leading axis indexed by the item's
        label, so accuracy may differ between the two labels.
    hashtag_treatment : {"strong", "weak", "oracle", "none"}
        "strong" and "weak" give ``rho`` an extra axis indexed by
        ``mcmc_in.flag_hashtag``; "strong" additionally sets the [1, 1]
        pseudocount to 49.  "oracle" and "none" use a flat (1, 2) prior
        without the hashtag axis.
    draws, tune : int
        Forwarded to ``pm.sample``.

    Returns
    -------
    The trace returned by ``pm.sample`` (a single chain).

    Raises
    ------
    ValueError
        If ``hashtag_treatment`` is not one of the supported values.
    '''
    # Fail fast: an unknown treatment used to skip every branch below and only
    # surfaced later as an unbound ``z``/``rho``.
    if hashtag_treatment not in ("strong", "weak", "oracle", "none"):
        raise ValueError(
            "hashtag_treatment must be one of 'strong', 'weak', 'oracle' or "
            "'none', got {!r}".format(hashtag_treatment))

    model = pm.Model()

    with model:
        if hashtag_treatment in ("strong", "weak"):
            rho_prior = np.ones((2, 2))
            if hashtag_treatment == "strong":
                # Heavier pseudocount on the [1, 1] cell is the only
                # difference between the strong and weak treatments.
                rho_prior[1, 1] = 49
            rho = pm.Dirichlet('rho', a=rho_prior, shape=(mcmc_in.K, 2, 2))
            z_probs = rho[mcmc_in.kk_lkup, mcmc_in.flag_hashtag]
        else:  # "oracle" or "none"
            rho_prior = np.ones((1, 2))
            rho = pm.Dirichlet('rho', a=rho_prior, shape=(mcmc_in.K, 2))
            z_probs = rho[mcmc_in.kk_lkup]

        # Masked (-999) entries of z_obs become free variables that are
        # sampled; the remaining entries are treated as observed.
        z = pm.Categorical('z',
                           p=z_probs,
                           observed=np.ma.masked_values(mcmc_in.z_obs, value=-999),
                           testval=mcmc_in.z_init,
                           shape=mcmc_in.N)

        # Explicit comparison kept on purpose: only True (or 1) selects the
        # asymmetric model, exactly as before.
        if asymmetric_accuracy == True:  # noqa: E712
            alpha = pm.Dirichlet("alpha", a=alpha_prior, shape=(2, mcmc_in.K, mcmc_in.J, 2))

            def logp(r, z=z, alpha=alpha):
                out = T.switch(T.eq(z[mcmc_in.ii], r),
                               T.log(alpha[z[mcmc_in.ii], mcmc_in.kk, mcmc_in.jj, 1]),
                               T.log(1 - alpha[z[mcmc_in.ii], mcmc_in.kk, mcmc_in.jj, 1])
                               )
                return T.sum(out)
        else:
            alpha = pm.Dirichlet("alpha", a=alpha_prior, shape=(mcmc_in.K, mcmc_in.J, 2))

            def logp(r, z=z, alpha=alpha):
                out = T.switch(T.eq(z[mcmc_in.ii], r),
                               T.log(alpha[mcmc_in.kk, mcmc_in.jj, 1]),
                               T.log(1 - alpha[mcmc_in.kk, mcmc_in.jj, 1])
                               )
                return T.sum(out)

        r = pm.DensityDist('r', logp, observed=mcmc_in.r_obs, shape=len(mcmc_in.r_obs))

        # Continuous parameters go to NUTS; the unobserved labels get a
        # categorical Gibbs/Metropolis step.
        step1 = pm.NUTS(vars=[rho, alpha])
        step2 = pm.CategoricalGibbsMetropolis(vars=[z.missing_values])
        trace = pm.sample(draws=draws, tune=tune, step=[step1, step2], chains=1)

    return trace
示例#2
0
 def _set_steps(self, model, z, *params):
     """Create the step methods for ``model`` and store them on the instance.

     A continuous step is always built by calling ``self.sampler(params)``
     inside the model context.  If a discrete variable ``z`` is given, a
     second step is created for it: ``RandomFieldGibbs`` when the variable's
     distribution has a ``name`` equal to ``BinaryMRF.NAME`` or
     ``CategoricalMRF.NAME``, and ``pm.CategoricalGibbsMetropolis`` otherwise.
     The resulting list is stored in ``self._steps`` and the model in
     ``self._model``.
     """
     with model:
         continuous = self.sampler(params)
         self._continuous_step = continuous

         if z is None:
             # No discrete variable: the continuous step is the only one.
             self._steps = [continuous]
         else:
             prior = z.distribution
             is_random_field = (
                 hasattr(prior, "name")
                 and prior.name in (BinaryMRF.NAME, CategoricalMRF.NAME)
             )
             if is_random_field:
                 discrete = RandomFieldGibbs([z])
             else:
                 discrete = pm.CategoricalGibbsMetropolis([z])
             self._discrete_step = discrete
             self._steps = [continuous, discrete]
     self._model = model
    def do_inference(self,
                     draws=20000,
                     tune=2000,
                     init='adapt_diag',
                     **kwargs):
        """Validate the model's test point, then sample from the model.

        The model is built first via ``self.build_model()`` when
        ``self.model`` is still None.  The log-probabilities at the test point
        are printed; if the entry for the ``Likelihood`` variable (prefixed
        with ``<model name>_`` when the model is named) is NaN, a message is
        printed and the process exits with status 1.

        When ``self.n_choice`` is set, it is sampled with
        ``CategoricalGibbsMetropolis`` while ``self.a``, ``self.psi`` and
        ``self.sigma_r`` use ``Metropolis``; otherwise ``pm.sample`` picks the
        step methods itself.

        Parameters
        ----------
        draws, tune, init :
            Forwarded to ``pm.sample``.
        **kwargs :
            Any further keyword arguments for ``pm.sample``.

        Returns
        -------
        The trace returned by ``pm.sample``.

        Raises
        ------
        SystemExit
            With a non-zero status if the test-point likelihood is undefined.
        """
        if self.model is None:
            self.build_model()

        # it's important we now check the model specification, namely do we
        # have any problems with logp being undefined?
        with self.model as model:
            test_point = model.check_test_point()

            # Variables of a named model are prefixed with "<name>_".
            l_key = self.model.name + '_' if self.model.name else ''

        print(test_point)
        if np.isnan(test_point['{}Likelihood'.format(l_key)]):
            print(
                'The model\'s test point had an undefined likelihood, meaning sampling will fail'
            )
            # This is a failure, so do not report success (status 0) to the
            # calling process.
            sys.exit(1)

        # Sampling
        with self.model as model:
            if self.n_choice is not None:
                step1 = pm.CategoricalGibbsMetropolis(self.n_choice)
                step2 = pm.Metropolis([self.a, self.psi, self.sigma_r])
                trace = pm.sample(draws=draws,
                                  tune=tune,
                                  init=init,
                                  step=[step1, step2],
                                  **kwargs)
            else:
                trace = pm.sample(draws=draws, tune=tune, init=init, **kwargs)
        return trace
def lohhla_clone_model(sample_ids,
                       tree_edges,
                       clonal_prevalence_mat,
                       cellularity,
                       ploidy_values,
                       tumour_sample_reads,
                       normal_sample_reads,
                       integercpn_info,
                       all_genotypes,
                       transition_inputs,
                       stayrate_alpha=0.9,
                       stayrate_beta=0.1,
                       sd=0.5,
                       nb_alpha=0.5,
                       iter_count=20000,
                       tune_iters=20000,
                       anchor_type='nb',
                       anchor_mode='snvcn',
                       nchains=2,
                       njobs=2):
    '''
    Build the clone-tree genotype model and sample from it.

    Each node of the clone tree gets a latent genotype (``CloneTreeGenotypes``)
    governed by a transition matrix built from a single ``stayrate``
    parameter.  Per sample, allele-specific read counts are modelled with a
    Beta-Binomial around the expected variant allele fraction, and total
    coverage is anchored either through a log-normal multiplication factor
    (``anchor_type='mult_factor'``) or a negative binomial on tumour reads
    (``anchor_type='nb'``).

    sample_ids: Iterable of sample identifiers; the position of each
        identifier selects the matching entry of the per-sample quantities
        (rows of the prevalence products, ``cellularity``, ``ploidy_values``).
    tree_edges: Table of 1-indexed tree edges exposing ``.values``
        (converted to 0-indexed integers internally).
    clonal_prevalence_mat: Matrix whose second dimension is the number of tree
        nodes (presumably samples x nodes -- TODO confirm).
    cellularity: Per-sample tumour cellularity.
    ploidy_values: Per-sample ploidy.
    tumour_sample_reads: Per-sample read totals, indexed by sample id.
    normal_sample_reads: Read total of the normal sample.
    integercpn_info: Nested table indexed first by column name
        ('TumorCov_type1', 'TumorCov_type2', 'Total_TumorCov',
        'Total_NormalCov', 'binNum', 'combinedBinTumor', 'combinedBinNormal')
        and then by sample id.
    all_genotypes: Dataframe of genotypes, 0-indexed, with 'total_cn' and
        'alt_cn' columns.
    transition_inputs: Mapping with 'valid_transitions', 'num_transitions'
        and 'num_genotypes'.
    stayrate_alpha: Beta prior alpha-parameter on stayrate in clone tree
        Markov chain.  NOTE: currently unused -- the stay rate has a Normal
        prior (mu=0.75, sd=0.4) bounded to [0, 1].
    stayrate_beta: Beta prior beta-parameter on stayrate in clone tree Markov
        chain.  NOTE: currently unused, see ``stayrate_alpha``.
    sd: Standard deviation of the log-normal used for 'mult_factor'.
    nb_alpha: Alpha parameter of the negative binomial used for 'nb'.
    iter_count: Number of draws passed to ``pm.sample``.
    tune_iters: Number of tuning iterations passed to ``pm.sample``.
    anchor_type: 'nb' or 'mult_factor'.
    anchor_mode: 'snvcn' (per-locus coverage) or 'binmedian' (one value per
        coverage bin).
    nchains: Number of chains.
    njobs: Forwarded to ``pm.sample`` as ``njobs``.

    Returns the trace produced by ``pm.sample``.

    Raises ValueError if ``anchor_mode`` or ``anchor_type`` is not supported.
    '''
    # Reject unsupported options before any model-building work is done.
    # ValueError is still an Exception, so existing handlers keep working.
    if anchor_mode not in ('snvcn', 'binmedian'):
        raise ValueError("Invalid option specified.")
    if anchor_type not in ('mult_factor', 'nb'):
        raise ValueError('Must specify a valid model type.')

    num_nodes = clonal_prevalence_mat.shape[1]

    valid_transitions = transition_inputs['valid_transitions']
    num_transitions = transition_inputs['num_transitions']
    num_genotypes = transition_inputs['num_genotypes']

    ## Beta-binomial dispersion (higher = less dispersed)
    dispersion = 200.

    ## Tree edges (``.values`` replaces ``as_matrix()``, which pandas removed)
    edges = tree_edges.values.astype(int) - 1

    with pm.Model() as model:
        BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.)
        stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4)

        ## Transition matrix: stay_rate on the diagonal, the remaining mass
        ## spread evenly over the valid transitions of each genotype.
        P = np.zeros(shape=(num_genotypes, num_genotypes))
        P = P + tt.eye(num_genotypes) * stay_rate

        fill_values = tt.as_tensor((1. - stay_rate) / num_transitions)
        fill_values = tt.set_subtensor(fill_values[0], 0)

        P = P + valid_transitions * fill_values[:, np.newaxis]
        ## Genotype 0 never transitions away: its off-diagonal fill is zero
        ## and its diagonal entry is forced to 1.
        P = tt.set_subtensor(P[0, 0], 1.)

        ## Uniform distribution over genotypes
        PA = tt.ones(shape=(num_genotypes)) / num_genotypes

        states = CloneTreeGenotypes('genotypes',
                                    PA=PA,
                                    P=P,
                                    edges=edges,
                                    k=num_genotypes,
                                    shape=(num_nodes))

        total_cns = theano.shared(np.array(all_genotypes['total_cn'].values))
        alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values))

        total_cn = pm.Deterministic('total_cn', total_cns[states])
        alt_cn = pm.Deterministic('alt_cn', alt_cns[states])

        ## Non-tumour cells contribute 1 alternative copy out of 2 total
        sample_alt_copies = tt.dot(clonal_prevalence_mat, alt_cn
                                   ) * cellularity + (1. - cellularity) * 1.

        vafs = sample_alt_copies / (
            tt.dot(clonal_prevalence_mat, total_cn) * cellularity +
            (1. - cellularity) * 2.)
        pm.Deterministic('vafs', vafs)

        alphas = vafs * dispersion
        betas = (1 - vafs) * dispersion

        ## Copy number of tumour cells (aggregated over clones, but not including normal contamination)
        tutotalcn = pm.Deterministic('tutotalcn',
                                     tt.dot(clonal_prevalence_mat, total_cn))

        ## Can't be vectorized further
        for j, current_sample in enumerate(sample_ids):
            alt_counts = integercpn_info['TumorCov_type2'][
                current_sample].values
            total_counts = integercpn_info['TumorCov_type1'][
                current_sample].values + alt_counts

            ## Draw alternative allele counts for HLA locus for each polymorphic site
            pm.BetaBinomial('x_' + str(j),
                            alpha=alphas[j],
                            beta=betas[j],
                            n=total_counts,
                            observed=alt_counts)

            mult_factor_mean = (tumour_sample_reads[current_sample] /
                                normal_sample_reads)

            ploidy = ploidy_values[j]
            ploidy_ratio = (tutotalcn[j] * cellularity[j] +
                            (1 - cellularity[j]) * 2) / (
                                cellularity[j] * ploidy +
                                (1 - cellularity[j]) * 2)

            ## Pick the observed tumour/normal coverage for this anchor mode
            if anchor_mode == 'snvcn':
                tumour_reads_observed = integercpn_info['Total_TumorCov'][
                    current_sample].values
                normal_reads_observed = integercpn_info['Total_NormalCov'][
                    current_sample].values
            else:  # 'binmedian'
                ## All within a bin are the same, so this is OK
                duplicated_entries = integercpn_info['binNum'][
                    current_sample].duplicated(keep='first')
                tumour_reads_observed = integercpn_info['combinedBinTumor'][
                    current_sample][~duplicated_entries].values
                normal_reads_observed = integercpn_info['combinedBinNormal'][
                    current_sample][~duplicated_entries].values

            nloci = len(tumour_reads_observed)
            mult_factor_computed = pm.Deterministic(
                'mult_factor_computed_' + str(j),
                1. / ploidy_ratio *
                (tumour_reads_observed / normal_reads_observed))

            ## Draw ploidy-corrected tumour/normal locus coverage ratio for each polymorphic site
            if anchor_type == 'mult_factor':
                pm.Lognormal('mult_factor_' + str(j),
                             mu=np.log(mult_factor_mean),
                             sd=sd,
                             observed=mult_factor_computed,
                             shape=(nloci))
            else:  # 'nb'
                tc_nc_ratio = pm.Deterministic(
                    'tc_nc_ratio_' + str(j), (tutotalcn[j] * cellularity[j] +
                                              (1 - cellularity[j]) * 2) /
                    (ploidy * cellularity[j] + (1 - cellularity[j]) * 2))

                pm.Deterministic(
                    'tumoursamplecn_' + str(j),
                    (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2))

                tumour_reads_mean = pm.Deterministic(
                    'tumour_reads_mean_' + str(j),
                    tc_nc_ratio * mult_factor_mean * normal_reads_observed)

                pm.NegativeBinomial('tumour_reads_' + str(j),
                                    mu=tumour_reads_mean,
                                    alpha=nb_alpha,
                                    observed=tumour_reads_observed)

        pm.Deterministic('log_prob', model.logpt)

        ## Discrete genotypes and the continuous stay rate need separate steps
        step1 = pm.CategoricalGibbsMetropolis(vars=[states])
        step2 = pm.Metropolis(vars=[stay_rate])

        trace = pm.sample(iter_count,
                          tune=tune_iters,
                          step=[step1, step2],
                          njobs=njobs,
                          chains=nchains)

        return trace
示例#5
0
    # Zero-mean Normal prior with a very large standard deviation (10000) on
    # G, a vector with one entry fewer than the number of states.
    G = pm.Normal('G',
                  mu=np.zeros(num_states - 1),
                  sd=np.ones(num_states - 1) * 10000.,
                  shape=(num_states - 1))

    # Latent state for every (customer, time step) pair, parameterised by PI,
    # Q and renewal_mask, starting from states_test_val.
    # NOTE(review): CommitmentProcess is defined elsewhere; PI/Q presumably are
    # initial and transition probabilities -- confirm.
    states = CommitmentProcess('states',
                               PI=PI,
                               Q=Q,
                               renewal_mask=renewal_mask,
                               num_states=num_states,
                               shape=(num_custs, obs_len),
                               testval=states_test_val)
    # Observed usage, conditioned on the latent states and on A, th0 and G.
    usage = UsageProcess('usage',
                         alpha=A,
                         th0=th0,
                         G=G,
                         states=states,
                         num_states=num_states,
                         shape=(num_custs),
                         observed=observed_usage)

    # Start sampling from the MAP estimate found with Powell's method.
    start = pm.find_MAP(method='Powell')
    # Metropolis for the listed variables, categorical Gibbs/Metropolis for
    # the discrete states.
    # NOTE(review): `usage` is an observed variable yet is listed in the
    # Metropolis vars -- confirm this is intended.
    step1 = pm.Metropolis(vars=[r, PI, Q, A, G, th0, usage])
    step2 = pm.CategoricalGibbsMetropolis(vars=[states])
    trace = pm.sample(draws, start=start, step=[step1, step2], chains=chains)

# Announce the destination, then persist the sampled trace to
# args.output_dir; overwrite=True is passed so an existing saved trace in that
# directory does not block the write.
print('saving to ' + args.output_dir)
pm.backends.ndarray.save_trace(trace,
                               directory=args.output_dir,
                               overwrite=True)
示例#6
0
def contaminate_mixture(data, fit_for='z', fit_data=None): #stickbreaking problems
    """Build the spindle-marker mixture model and its step methods.

    Every observed marker ``data['s']`` is modelled as coming either from a
    uniform "contamination" component on [0, 25] or from a Normal centred on
    one of up to five true spindle locations (``gss``) of its epoch, with a
    rater-specific standard deviation ``r_E``.

    Parameters
    ----------
    data : table-like
        Must expose ``.shape`` and the columns 'epoch_i', 'rater_i' and 's'.
    fit_for : str
        'z'  -- infer the number of spindles per epoch (``z``), the component
        assignment of each marker (``w``) and the locations;
        'w'  -- take ``z`` from ``fit_data`` and infer ``w`` and the locations;
        anything else -- take both ``z`` and ``w`` from ``fit_data``.
    fit_data : mapping, optional
        Supplies 'z' (and 'w') when they are not inferred.  Required whenever
        ``fit_for`` is not 'z'.

    Returns
    -------
    (model, steps)
        The PyMC3 model and a list of step methods.  The list is empty when
        ``fit_for`` is neither 'z' nor 'w'.

    Raises
    ------
    ValueError
        If ``fit_for`` is not 'z' and no ``fit_data`` was given.
    """
    # Without this check the failure surfaces mid-construction as an opaque
    # "'NoneType' object is not subscriptable".
    if fit_for != 'z' and fit_data is None:
        raise ValueError(
            "fit_data must be provided when fit_for is {!r}".format(fit_for))

    steps = []
    # shapes and sizes
    n_epochs = data['epoch_i'].max() + 1  # each epoch indexed by epoch_i
    n_raters = data['rater_i'].max() + 1
    n_obs = data.shape[0]  # each spindle marker indexed by t

    # static priors vars
    trust_purcell = 0.1  # crank up to give more weight to purcell et al, 2017
    # Adding the constant (1 - trust_purcell) before normalising flattens the
    # empirical spindle-count distribution.
    purcell = np.array([0.3587, 0.6387, 0.0026, 0., 0., 0.]) + (1 - trust_purcell)
    s_number_prior = purcell / purcell.sum()
    max_s = len(s_number_prior) - 1
    gss_spindle_testvals = [1., 5., 10., 15., 20.]
    with pm.Model() as model:

        # True s
        gss = pm.Uniform('gss', lower=0., upper=25., shape=(n_epochs, max_s),
                         testval=np.tile(np.array(gss_spindle_testvals).T, reps=(n_epochs, 1),))  # Real spindles
        gss_per_obs = gss[data['epoch_i'], :]

        # The number of spindles per epoch:
        if fit_for == 'z':
            gss_prior = pm.Dirichlet('gss_prior', a=s_number_prior)
            if n_epochs > 1:
                z = pm.Categorical('z', p=gss_prior,
                                   shape=n_epochs)
            else:
                z = pm.Categorical('z', p=gss_prior)
        else:
            z = fit_data['z']
        z_rs = z.reshape((n_epochs, 1))

        if fit_for in ['w', 'z']:  # when we are finding z or w
            # Row z of the lower-triangular matrix has ones in columns 0..z,
            # so a marker can only be assigned to contamination (0) or to one
            # of the first z spindles of its epoch.
            w_prior_possibilities = tt.tril(tt.ones((max_s + 1, max_s + 1)))
            w = pm.Categorical('w', p=w_prior_possibilities[z_rs[data['epoch_i'], 0], :], shape=n_obs)
        else:  # fit for gss
            w = fit_data['w']

        # --- Raters ability to detect markers --- #
        r_E = pm.Bound(pm.Normal, lower=0.)('r_E', mu=0.5, sd=0.5, shape=n_raters)
        r_E_per_obs = r_E[data['rater_i']]

        # --- Behaviour --- #
        # Component 0: contamination, uniform over the whole range.
        contaminate_dist_s = pm.Uniform.dist(lower=0., upper=25., shape=n_obs)
        contaminate_dist_s.mean = 12.5
        possible_dists = [contaminate_dist_s]
        # Components 1..5: one Normal per candidate spindle location.
        for i, testval in enumerate(gss_spindle_testvals):
            dist = pm.Normal.dist(mu=gss_per_obs[:, i], sd=r_E_per_obs)
            dist.mean = testval
            possible_dists.append(dist)

        # One-hot weights pick exactly one component per marker.
        w_array = tt.extra_ops.to_one_hot(w, nb_class=max_s + 1)
        s = pm.Mixture('s', w=w_array,
                       comp_dists=possible_dists,
                       observed=data['s'])

        #STEP methods for vars:
        if fit_for == 'z':
            steps = [pm.CategoricalGibbsMetropolis([z, w]),
                     pm.NUTS([gss_prior, gss, r_E], target_accept=0.9)]
        elif fit_for == 'w':
            steps = [pm.CategoricalGibbsMetropolis([w]),
                     pm.NUTS([gss, r_E], target_accept=0.9)]
        #else, everything NUTS

    return model, steps
示例#7
0
    # Cluster sizes: mixing proportions with a flat Dirichlet prior.
    # NOTE(review): the concentration vector has three entries while the shape
    # is `k` -- this presumably assumes k == 3; confirm.
    p = pm.Dirichlet('p', a=np.array([1., 1., 1.]), shape=k)

    # Ensure all clusters have some points: the potential adds -inf to the
    # log-probability whenever the smallest proportion drops below 0.1.
    p_min_potential = pm.Potential('p_min_potential',
                                   tt.switch(tt.min(p) < .1, -np.inf, 0))

    # Cluster centers: zero-mean Normal prior with sigma 15 on each center.
    means = pm.Normal('means', mu=[0, 0, 0], sigma=15, shape=k)

    # Break symmetry: rule out any draw where means[0] <= means[1] <= means[2]
    # does not hold, so the cluster labels cannot swap.
    order_means_potential = pm.Potential(
        'order_means_potential',
        tt.switch(means[1] - means[0] < 0, -np.inf, 0) +
        tt.switch(means[2] - means[1] < 0, -np.inf, 0))

    # Measurement error: a single standard deviation shared by all clusters.
    sd = pm.Uniform('sd', lower=0, upper=20)

    # Latent cluster of each observation (one label per data point).
    category = pm.Categorical('category', p=p, shape=ndata)

    # Likelihood for each observed value: Normal around the center of the
    # cluster the observation is assigned to.
    points = pm.Normal('obs', mu=means[category], sigma=sd, observed=data)
""" Fit Model """

# Sample the model: Metropolis for the continuous parameters and a categorical
# Gibbs/Metropolis step for the discrete cluster labels; 10000 draws with
# tune=5000.
with model:
    step1 = pm.Metropolis(vars=[p, sd, means])
    step2 = pm.CategoricalGibbsMetropolis(vars=[category])
    tr = pm.sample(10000, step=[step1, step2], tune=5000)