Exemplo n.º 1
def testNegativeBinomial():
    x_lim = 60
    burnin = 50000
    with pm.Model() as model:
        alpha = pm.Exponential('alpha', lam=0.2)
        mu = pm.Uniform('mu', lower=0, upper=100)
        y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
        # y_esti is the likelihood term: it conditions mu and alpha on the
        # observed response times (the posterior is inferred from these data)
        y_esti = pm.NegativeBinomial('y_esti',
                                     mu=mu,
                                     alpha=alpha,
                                     observed=msg['time_delay_seconds'].values)
        start = pm.find_MAP()
        step = pm.Metropolis()
        trace = pm.sample(200000, step, start=start, progressbar=True)
        pm.traceplot(trace[burnin:], varnames=['alpha', 'mu'])

    # fig = plt.figure(figsize=(10, 6))
    # fig.add_subplot(211)
    # y_pred = trace[burnin:].get_values('y_pred')
    # plt.hist(y_pred, range=[0, x_lim],
    #          bins=x_lim, histtype='stepfilled', color=colors[1])
    # plt.xlim(1, x_lim)
    # plt.ylabel('Frequency')
    # plt.title('Posterior predictive distribution')

    # fig.add_subplot(212)
    # plt.hist(msg['time_delay_seconds'].values,
    #          range=[0, x_lim], bins=x_lim, histtype='stepfilled')
    # plt.xlabel('Response time in seconds')
    # plt.ylabel('Frequency')
    # plt.title('Distribution of observed data')
    # plt.tight_layout()
    # plt.show()
    return trace
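A minimal usage sketch for the function above (it assumes the global `msg` DataFrame from the source repo is loaded, and uses the same old-style `varnames` keyword as the traceplot call):

trace = testNegativeBinomial()
burnin = 50000
print(pm.summary(trace[burnin:], varnames=['alpha', 'mu']))
print('posterior predictive mean:', trace[burnin:].get_values('y_pred').mean())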
Exemplo n.º 2
def test_HSStep_NegativeBinomial():
    np.random.seed(2032)
    M = 5
    N = 50
    X = np.random.normal(size=N * M).reshape((N, M))
    beta_true = np.array([1, 1, 2, 2, 0])
    y_nb = pm.NegativeBinomial.dist(np.exp(X.dot(beta_true)), 1).random()

    N_draws = 500
    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M)
        pm.NegativeBinomial("y",
                            mu=at.exp(beta.dot(X.T)),
                            alpha=1,
                            observed=y_nb)
        hsstep = HSStep([beta])
        trace = pm.sample(
            draws=N_draws,
            step=hsstep,
            chains=1,
            return_inferencedata=True,
            compute_convergence_checks=False,
        )

    beta_samples = trace.posterior["beta"][0].values
    assert beta_samples.shape == (N_draws, M)
    np.testing.assert_allclose(beta_samples.mean(0), beta_true, atol=0.5)

    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1)
        pm.NegativeBinomial("y",
                            mu=beta.dot(np.abs(X.T)),
                            alpha=1,
                            observed=y_nb)
        hsstep = HSStep([beta])
        trace = pm.sample(
            draws=N_draws,
            step=hsstep,
            chains=1,
            return_inferencedata=True,
            compute_convergence_checks=False,
        )

    beta_samples = trace.posterior["beta"][0].values
    assert beta_samples.shape == (N_draws, M)

    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M, testval=beta_true * 0.1)
        eta = pm.NegativeBinomial("eta", mu=beta.dot(X.T), alpha=1, shape=N)
        pm.Normal("y", mu=at.exp(eta), sigma=1, observed=y_nb)

        with pytest.raises(NotImplementedError):
            HSStep([beta])
Exemplo n.º 3
def testSepModels():
    indiv_traces = {}

    # convert categorical variables to integer
    le = preprocessing.LabelEncoder()
    participants_idx = le.fit_transform(msg['prev_sender'])
    # print('participants_idx:\n', participants_idx)
    participants = le.classes_
    print('participants:\n', participants)
    participants_num = len(participants)

    for p in participants:
        with pm.Model() as model:
            alpha = pm.Uniform('alpha', lower=0, upper=100)
            mu = pm.Uniform('mu', lower=0, upper=100)
            data = msg[msg['prev_sender'] == p]['time_delay_seconds'].values
            y_esti = pm.NegativeBinomial('y_esti',
                                         mu=mu,
                                         alpha=alpha,
                                         observed=data)
            y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
            start = pm.find_MAP()
            step = pm.Metropolis()
            trace = pm.sample(20000, step, start=start,
                              progressbar=True)  # sampling
            indiv_traces[p] = trace

    # visualize results
    # fig, axs = plt.subplots(3, 2, figsize=(12, 6))
    # axs = axs.ravel()  # obtain subplots
    # y_left_max = 2
    # y_right_max = 2000
    # x_lim = 60
    # ix = [3, 4, 6]  # selected samples

    # for i, j, p in zip([0, 1, 2], [0, 2, 4], participants[ix]):
    #     axs[j].set_title('Observed: %s' % p)
    #     axs[j].hist(msg[msg['prev_sender'] == p]['time_delay_seconds'].values,
    #                 range=[0, x_lim], bins=x_lim, histtype='stepfilled')
    #     axs[j].set_ylim([0, y_left_max])
    # for i, j, p in zip([0, 1, 2], [1, 3, 5], participants[ix]):
    #     axs[j].set_title('Posterior predictive distribution: %s' % p)
    #     axs[j].hist(indiv_traces[p].get_values('y_pred'),
    #                 range=[0, x_lim], bins=x_lim,
    #                 histtype='stepfilled', color=colors[1])
    #     axs[j].set_ylim([0, y_right_max])
    # axs[4].set_xlabel('Response time (seconds)')
    # axs[5].set_xlabel('Response time (seconds)')
    # plt.tight_layout()
    # plt.show()
    return indiv_traces
Exemplo n.º 4
def funcTrace24(path): # 'data/hr_day_cnctd.xlsx'
    
    import numpy as np    
    import pymc3 as pm
    import pandas as pd
    
    # When we want to understand the effect of more factors such as "day of week,"
    # "time of day," etc. We can use GLM (generalized linear models) to better
    # understand the effects of these factors.
    
    # Import data
    data = pd.read_excel(path, index_col='Index')

    #%% Hourly NegativeBinomial modeling
    # For each hour j and each EV connected i, we represent the model
    indiv_traces = {}
    
    # Convert categorical variables to integer
    hours = list(data.Hour)
    n_hours = len(hours)
    x_lim = 16
    
    print('---- Working -----')
    
    out_yPred = pd.DataFrame(np.zeros((x_lim,len(hours))), columns=list(hours))
    out_yObs = pd.DataFrame(np.zeros((x_lim,len(hours))), columns=list(hours))
    
    for h in hours:
        print('Hour: ', h)
        with pm.Model() as model:
            alpha = pm.Uniform('alpha', lower=0, upper=10)
            mu = pm.Uniform('mu', lower=0, upper=10)
    
            y_obs = data[data.Hour==h]['Connected'].values
            y_est = pm.NegativeBinomial('y_est', mu=mu, alpha=alpha, observed=y_obs)
    
            y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
    
            trace = pm.sample(10000, progressbar=True)
    
            indiv_traces[h] = trace
    
        out_yPred.loc[:,h], _ = np.histogram(indiv_traces[h].get_values('y_pred'), bins=x_lim)
        out_yObs.loc[:,h], _ = np.histogram(data[data.Hour==h]['Connected'].values, bins=x_lim)
    
    # Export results
    out_yPred.to_csv('out_yPred.csv')
    out_yObs.to_csv('out_yObs.csv')

    return(out_yPred, out_yObs)
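A hypothetical invocation (the path mirrors the hint in the comment next to the function signature):

out_yPred, out_yObs = funcTrace24('data/hr_day_cnctd.xlsx')
print(out_yPred.head())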
Exemplo n.º 5
    def init_model(self, target):
        days, counties = target.index, target.columns

        # extract features
        features = self.evaluate_features(days, counties)
        Y_obs = target.stack().values.astype(np.float32)
        T_S = features["temporal_seasonal"].values.astype(np.float32)
        T_T = features["temporal_trend"].values.astype(np.float32)
        TS = features["spatiotemporal"].values.astype(np.float32)

        log_exposure = np.log(
            features["exposure"].values.astype(np.float32).ravel())

        # extract dimensions
        num_obs = np.prod(target.shape)
        num_t_s = T_S.shape[1]
        num_t_t = T_T.shape[1]
        num_ts = TS.shape[1]

        with pm.Model() as self.model:
            # interaction effects are generated externally -> flat prior
            IA = pm.Flat("IA", testval=np.ones(
                (num_obs, self.num_ia)), shape=(num_obs, self.num_ia))

            # priors
            # overdispersion scale: α = 1/δ
            δ = pm.HalfCauchy("δ", 10, testval=1.0)
            α = pm.Deterministic("α", np.float32(1.0) / δ)
            W_ia = pm.Normal("W_ia", mu=0, sd=10, testval=np.zeros(
                self.num_ia), shape=self.num_ia)
            W_t_s = pm.Normal("W_t_s", mu=0, sd=10,
                              testval=np.zeros(num_t_s), shape=num_t_s)
            W_t_t = pm.Normal("W_t_t", mu=0, sd=10,
                              testval=np.zeros(num_t_t), shape=num_t_t)
            W_ts = pm.Normal("W_ts", mu=0, sd=10,
                             testval=np.zeros(num_ts), shape=num_ts)
            self.param_names = ["δ", "W_ia", "W_t_s", "W_t_t", "W_ts"]
            self.params = [δ, W_ia, W_t_s, W_t_t, W_ts]

            # calculate interaction effect
            IA_ef = tt.dot(tt.dot(IA, self.Q), W_ia)

            # calculate mean rates
            μ = pm.Deterministic(
                "μ",
                tt.exp(IA_ef + tt.dot(T_S, W_t_s) + tt.dot(T_T, W_t_t) +
                       tt.dot(TS, W_ts) + log_exposure))

            # constrain to observations
            pm.NegativeBinomial("Y", mu=μ, alpha=α, observed=Y_obs)
Exemplo n.º 6
def test_HSStep_NegativeBinomial_sparse():
    np.random.seed(2032)
    M = 5
    N = 50
    X = np.random.normal(size=N * M).reshape((N, M))
    beta_true = np.array([1, 1, 2, 2, 0])
    y_nb = pm.NegativeBinomial.dist(np.exp(X.dot(beta_true)), 1).random()

    X = sp.sparse.csr_matrix(X)

    N_draws = 500
    with pm.Model():
        beta = HorseShoe("beta", tau=1, shape=M)
        pm.NegativeBinomial("y",
                            mu=at.exp(sp_dot(X, at.shape_padright(beta))),
                            alpha=1,
                            observed=y_nb)
        hsstep = HSStep([beta])
        trace = pm.sample(
            draws=N_draws,
            step=hsstep,
            chains=1,
            return_inferencedata=True,
            compute_convergence_checks=False,
        )

    beta_samples = trace.posterior["beta"][0].values
    assert beta_samples.shape == (N_draws, M)
    np.testing.assert_allclose(beta_samples.mean(0), beta_true, atol=0.5)
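For reference, `at.shape_padright(beta)` appends a trailing unit axis so the sparse dot yields a column of rates; a plain-numpy analogue of that shape manipulation (illustrative only):

import numpy as np
beta = np.ones(5)
assert beta[:, None].shape == (5, 1)  # graph-side analogue: at.shape_padright(beta)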
Exemplo n.º 7
    def _sample_pymc3(cls, dist, size, seed):
        """Sample from PyMC3."""

        import pymc3
        pymc3_rv_map = {
            'GeometricDistribution':
            lambda dist: pymc3.Geometric('X', p=float(dist.p)),
            'PoissonDistribution':
            lambda dist: pymc3.Poisson('X', mu=float(dist.lamda)),
            'NegativeBinomialDistribution':
            lambda dist: pymc3.NegativeBinomial('X',
                                                mu=float((dist.p * dist.r) /
                                                         (1 - dist.p)),
                                                alpha=float(dist.r))
        }

        dist_list = pymc3_rv_map.keys()

        if dist.__class__.__name__ not in dist_list:
            return None

        with pymc3.Model():
            pymc3_rv_map[dist.__class__.__name__](dist)
            return pymc3.sample(size,
                                chains=1,
                                progressbar=False,
                                random_seed=seed)[:]['X']
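The `mu` expression above converts sympy's (r, p) parameterization into pymc3's (mu, alpha) one; a quick check with illustrative values that the implied variance also matches (sympy's NegativeBinomial has mean r*p/(1-p) and variance r*p/(1-p)**2):

r, p = 5.0, 0.3
mu, alpha = p * r / (1 - p), r
# pymc3's NegativeBinomial(mu, alpha) has variance mu + mu**2 / alpha
assert abs((mu + mu**2 / alpha) - r * p / (1 - p)**2) < 1e-12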
Exemplo n.º 8
def add_observations():
    with hierarchical_model.pymc_model:
        for i in range(hierarchical_model.n_groups):
            observations.append(
                pm.NegativeBinomial(f'y_{i}',
                                    mu=mu[i],
                                    alpha=alpha[i],
                                    observed=hierarchical_model.y[i]))
Exemplo n.º 9
    def phenom_model(self, method, field='deaths'):
        self.models[method] = {}
        for i, country in enumerate(self.countries):
            with pm.Model() as model:
                temp = self.data[(self.data.Country == country)].groupby(
                    ['time']).mean()[field].values
                print(country, temp[0])
                # TODO: Add if not external
                self.phenom_constrains = {
                    'c1m': 1e-13,
                    'c1M': 10,
                    'c2m': np.argmax(temp) * 1 / 3,  # or add 0
                    'c2M': np.argmax(temp) * 3,
                    'c3m': np.max(temp),
                    'c3M': 50000
                }
                print('phenom_constrains: ', self.phenom_constrains)
                const = {}
                for cn in ['c1', 'c2', 'c3']:
                    const[cn] = pm.Uniform(cn,
                                           self.phenom_constrains[cn + 'm'],
                                           self.phenom_constrains[cn + 'M'])

                sigma = pm.HalfNormal('sigma', 100., shape=1)

                Nrepeat = 10
                T = np.arange(0, len(temp))
                T = np.append(T, np.repeat(T[-Nrepeat:], Nrepeat * 3))
                temp = np.append(temp, np.repeat(temp[-Nrepeat:], Nrepeat * 3))

                x = pm.Data("x", T)
                cases = pm.Data("y", temp)

                # Likelihood
                if method == 'log-model':
                    pm.NegativeBinomial(
                        country,
                        const['c3'] * (1 / (1 + np.exp(-(const['c1'] *
                                                         (x - const['c2']))))),
                        sigma,
                        observed=cases)

                if method == 'gompertz-model':
                    pm.Poisson(country,
                               const['c3'] *
                               np.exp(-np.exp(-const['c1'] *
                                              (x - const['c2']))),
                               observed=cases)

            self.models[method][country] = model
        return self.models
Exemplo n.º 10
def testPartialFusionModel():
    global msg
    with pm.Model() as model:
        # hyperparameters
        hyper_mu_mu = pm.Uniform('hyper_mu_mu', lower=0, upper=60)
        hyper_mu_sd = pm.Uniform('hyper_mu_sd', lower=0, upper=50)
        hyper_alpha_mu = pm.Uniform('hyper_alpha_mu', lower=0, upper=10)
        hyper_alpha_sd = pm.Uniform('hyper_alpha_sd', lower=0, upper=50)

        # participants
        le = preprocessing.LabelEncoder()
        participants_idx = le.fit_transform(msg['prev_sender'])
        participants = le.classes_
        parti_num = len(participants)

        # parameters
        mu = pm.Gamma('mu', mu=hyper_mu_mu, sd=hyper_mu_sd, shape=parti_num)
        alpha = pm.Gamma('alpha',
                         mu=hyper_alpha_mu,
                         sd=hyper_alpha_sd,
                         shape=parti_num)

        # sampling
        y_esti = pm.NegativeBinomial('y_esti',
                                     mu=mu[participants_idx],
                                     alpha=alpha[participants_idx],
                                     observed=msg['time_delay_seconds'].values)
        y_pred = pm.NegativeBinomial('y_pred',
                                     mu=mu[participants_idx],
                                     alpha=alpha[participants_idx],
                                     shape=msg['prev_sender'].shape)
        start = pm.find_MAP()
        step = pm.Metropolis()
        hierarchical_trace = pm.sample(200000, step, start=start,
                                       progressbar=True)
        pm.traceplot(hierarchical_trace[120000:],
                     varnames=[
                         'mu', 'alpha', 'hyper_mu_mu', 'hyper_mu_sd',
                         'hyper_alpha_mu', 'hyper_alpha_sd'
                     ])
Exemplo n.º 11
    def phenom_model(self, method, field='deaths'):
        # initialize once, outside the loop, so earlier countries are kept
        self.models = {}
        self.models[method] = {}
        for i, country in enumerate(self.countries):
            with pm.Model() as model:
                print('phenom_constrains:', self.phenom_constrains,'\n')

                const = {}
                for cn in ['c1','c2','c3']:
                    grp = pm.Normal(cn+'grp', self.phenom_constrains[cn+'M'], self.phenom_constrains[cn+'s'])
                    # Group variance
                    grp_sigma = pm.HalfNormal(cn+'grp_sigma', self.phenom_constrains[cn+'s'])
                    # Individual intercepts
                    const[cn] = pm.Normal(cn,  mu=grp, sigma=grp_sigma,  shape=len(self.countries))

                sigma = pm.HalfNormal('sigma', 10000., shape=len(self.countries))

                temp = self.data[self.data['Country'] == country][field].values
                x = pm.Data("x",  np.arange(0, len(temp)))
                cases = pm.Data("y",  temp)

                # Likelihood
                if method == 'log-model':
                    pm.NegativeBinomial(
                        country, 
                        const['c3'][i]*(1/(1 + np.exp(-(const['c1'][i] * (-const['c2'][i] + x))))),
                        sigma[i], 
                        observed=cases)
                    
                if method == 'gompertz-model':
                    pm.NegativeBinomial(
                        country, 
                        const['c3'][i]*np.exp(-np.exp(-const['c1'][i]*(x-const['c2'][i]))),
                        sigma[i], 
                        observed=cases)
            self.models[method][country] = model
        return self.models
Exemplo n.º 12
def get_model(dist, data) -> pm.Model:
    means = data.mean(0)
    n_exp = data.shape[1]
    if dist == "Poisson":
        with pm.Model() as poi_model:
            lam = pm.Exponential("lam", lam=means, shape=(1, n_exp))
            poi = pm.Poisson(
                "poi",
                mu=lam,
                observed=data,
            )
        return poi_model
    if dist == "ZeroInflatedPoisson":
        with pm.Model() as zip_model:
            psi = pm.Uniform("psi", shape=(1, n_exp))
            lam = pm.Exponential("lam", lam=means, shape=(1, n_exp))
            zip = pm.ZeroInflatedPoisson(
                "zip",
                psi=psi,
                theta=lam,
                observed=data,
            )
        return zip_model
    if dist == "NegativeBinomial":
        with pm.Model() as nb_model:
            gamma = pm.Gamma("gm", 0.01, 0.01, shape=(1, n_exp))
            lam = pm.Exponential("lam", lam=means, shape=(1, n_exp))
            nb = pm.NegativeBinomial(
                "nb",
                alpha=gamma,
                mu=lam,
                observed=data,
            )
        return nb_model
    if dist == "ZeroInflatedNegativeBinomial":
        with pm.Model() as zinb_model:
            gamma = pm.Gamma("gm", 0.01, 0.01, shape=(1, n_exp))
            lam = pm.Exponential("lam", lam=means, shape=(1, n_exp))
            psi = pm.Uniform("psi", shape=(1, n_exp))
            zinb = pm.ZeroInflatedNegativeBinomial(
                "zinb",
                psi=psi,
                alpha=gamma,
                mu=lam,
                observed=data,
            )
        return zinb_model
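A hedged usage sketch for `get_model` (synthetic stand-in counts; the names and data here are illustrative, not from the source):

import numpy as np
import pymc3 as pm

data = np.random.poisson(3.0, size=(100, 2))
with get_model("NegativeBinomial", data):
    idata = pm.sample(1000, tune=1000, return_inferencedata=True)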
Exemplo n.º 13
    def nb_model(self, N=1000, tune=1000):
        dat = self.data
        mu = self.mu
        alpha = self.alpha

        dat = np.asarray(dat)
        # discard outliers above 10, then keep only strictly positive values
        dat[dat > 10] = 0
        dat = dat[dat > 0]
        print(np.max(dat))
        with pm.Model() as model_n:
            mu = pm.Uniform('mu', lower=0, upper=mu)
            alpha = pm.Uniform('alpha', lower=0, upper=alpha)
            # y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)
            y_est = pm.NegativeBinomial('y_est',
                                        mu=mu,
                                        alpha=alpha,
                                        observed=dat)
            trace_n = pm.sample(N, tune=tune, cores=2)

        return trace_n
Exemplo n.º 14
def mcmcNegativeBinomial(data):
    """Generate a trace for the data"""
    with pm.Model() as model:
        # No strong prior knowledge about the Negative Binomial parameters,
        # so choose uniform priors; to be safe, make the ranges larger than needed.
        alpha_rv = pm.Uniform('alpha_rv', 0.0, 3.0)
        mu_rv = pm.Uniform('mu_rv', 0.1, 30.0)
        score_rv = pm.NegativeBinomial('score_rv',
                                       mu=mu_rv,
                                       alpha=alpha_rv,
                                       observed=data)
        step = pm.NUTS()
        trace = pm.sample(step=step,
                          draws=10000,
                          chains=4,
                          cores=4,
                          init='adapt_diag')
        graph = pm.model_to_graphviz(model)
    graph.render(filename='model', format='png')
    return trace
Exemplo n.º 15
def visualize_trace(trace, data, desc='2007'):
    """Interpret the trace"""
    print(f"Visualize the probability distribution of two parameters.")
    alphas = trace.get_values('alpha_rv')
    mus = trace.get_values('mu_rv')
    fig, ax = plt.subplots(figsize=[9, 6], nrows=2)
    ax[0].hist(alphas, bins='auto', density=True)
    ax[0].set_title(f"Probability Distribution of Beta ({desc})")
    ax[1].hist(mus, bins='auto', density=True)
    ax[1].set_title(f"Probability Distribution of q ({desc})")
    plt.tight_layout()
    plt.show()
    print(
        "Reconstruct the PHQ score distribution from the posterior means and "
        "compare with the original data to assess the fit."
    )
    mu_mean = np.mean(mus)
    alpha_mean = np.mean(alphas)
    with pm.Model() as model:
        score_rv = pm.NegativeBinomial('score_rv',
                                       mu=mu_mean,
                                       alpha=alpha_mean)
        x = score_rv.random(size=10000)
    # HACK: I don't know how to bound the model, so instead I cut off the tail
    #       after drawing the data. Luckily, very little data exceeds the boundary.
    x = x[x <= 27]
    plot_ecdf([x, data],
              labels=[
                  f"Random Data (mu={mu_mean:.3f}, alpha={alpha_mean:.3f})",
                  desc
              ],
              alphas=[1, 0.9])
    plot_hist([x, data],
              bins=[27, 27],
              labels=[
                  f"Random Data (mu={mu_mean:.3f}, alpha={alpha_mean:.3f})",
                  desc
              ],
              alphas=[1, 0.5])
    return alpha_mean, mu_mean
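An illustrative end-to-end run tying Examples 14 and 15 together (a synthetic stand-in for the PHQ scores; assumes the `plot_ecdf`/`plot_hist` helpers from the source are importable):

scores = np.random.negative_binomial(2, 0.3, size=500)
trace = mcmcNegativeBinomial(scores)
alpha_mean, mu_mean = visualize_trace(trace, scores, desc='simulated')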
Exemplo n.º 16
def test_set_initval():
    # Make sure the dependencies between variables are maintained when
    # generating initial values
    rng = np.random.RandomState(392)

    with pm.Model(rng_seeder=rng) as model:
        eta = pm.Uniform("eta", 1.0, 2.0, size=(1, 1))
        mu = pm.Normal("mu", sd=eta, initval=[[100]])
        alpha = pm.HalfNormal("alpha", initval=100)
        value = pm.NegativeBinomial("value", mu=mu, alpha=alpha)

    assert np.array_equal(model.initial_values[model.rvs_to_values[mu]], np.array([[100.0]]))
    np.testing.assert_almost_equal(model.initial_values[model.rvs_to_values[alpha]], np.log(100))
    assert 50 < model.initial_values[model.rvs_to_values[value]] < 150

    # `Flat` cannot be sampled, so let's make sure that doesn't break initial
    # value computations
    with pm.Model() as model:
        x = pm.Flat("x")
        y = pm.Normal("y", x, 1)

    assert model.rvs_to_values[y] in model.initial_values
Exemplo n.º 17
def run_mcmc(
    df,
    country="US",
    days_in_future=50,
    logy=True,
    totalPop=7e9,
    tune=5000,
    draws=1200,
):
    dates = df.index
    y = df.loc[:, country].values
    x = (dates - np.datetime64(dates[0])).days
    xplot = np.arange(x[-1] + days_in_future)

    p0 = np.log([2.3, 46, 2000])
    x0, cov = curve_fit(logistic_model, x, y, p0=p0, maxfev=10000)

    with pm.Model() as model:

        def logistic_cdf(x, la, lb, lc):
            a, b, c = la, tt.exp(lb), tt.exp(lc)
            return c / (1 + tt.exp(-(x - b) / a))

        # growthBound = pm.Bound(pm.Normal, lower=0)
        # loga = growthBound("loga", mu=tt.log(5), sd=3)
        growthBound = pm.Bound(pm.Gamma, lower=1)
        a = growthBound("loga", alpha=3.5, beta=1)

        logb = pm.Normal("logb", mu=tt.log(150), sd=3)

        popBound = pm.Bound(
            pm.Normal, upper=tt.log(totalPop), lower=tt.log(y[-1])
        )
        logc = popBound("logc", mu=np.log(0.1 * totalPop), sd=5)

        # switching to an InverseGamma prior on sd, because it's the
        # conjugate prior of the normal distribution with unknown sd

        # logsd = pm.Normal("logsd", mu=2, sd=2)
        mask = y > 50
        sd = pm.InverseGamma(
            "logsd",
            mu=np.std(y[mask] / x[mask]),
            sd=np.std(y[mask] / x[mask]) / len(x[mask]),
        )

        mod = logistic_cdf(x.values[mask], a, logb, logc)

        # pm.Normal("obs", mu=mod, sd=sd, observed=y[mask])
        # move to Negative Binomial
        obs = pm.NegativeBinomial("obs", mu=mod, alpha=sd, observed=y[mask])

        mod_eval = pm.Deterministic(
            "mod_eval", logistic_cdf(xplot, a, logb, logc)
        )

        map_params = optimize()

        trace = pm.sample(
            draws=draws,
            tune=tune,
            chains=2,
            cores=2,
            start=map_params,
            target_accept=0.9,
            progressbar=False,
        )

    q = np.percentile(trace["mod_eval"], q=[50, 90, 10], axis=0)

    if logy:
        p = plotting.figure(y_axis_type="log", x_axis_type="datetime")
        p.yaxis.formatter = FuncTickFormatter(code=code)
    else:
        p = plotting.figure(y_axis_type="linear", x_axis_type="datetime")

    # ln = p.line(
    #     [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
    #     q[0],
    #     line_width=2,
    # )
    ln = p.line(
        [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
        np.mean(trace["mod_eval"], axis=0),
        line_width=2,
    )
    p.line(
        [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
        q[1],
        line_dash="dashed",
        line_width=1,
    )
    p.line(
        [dates[0] + datetime.timedelta(days=x) for x in range(0, xplot[-1])],
        q[2],
        line_dash="dashed",
        line_width=1,
    )
    p.circle(dates, y, color=colors[1])
    p.y_range = Range1d(10, 1.2 * np.max(q[1]))
    p.yaxis.formatter = FuncTickFormatter(code=code)

    legend_it = [(country, [ln])]
    legend = Legend(
        items=legend_it, location="top_right", orientation="horizontal"
    )
    legend.spacing = 17
    legend.click_policy = "hide"
    p.add_layout(legend, "above")

    label_opts = dict(
        x=dates[0] + datetime.timedelta(days=int(xplot[-1])),
        y=np.max(q[1]) * 1.1,
        text_align="right",
        text_font_size="9pt",
    )

    caption = Label(
        text=f'Created by Tom Barclay on {datetime.datetime.now().strftime("%b %d, %Y")}',
        **label_opts,
    )

    p.add_layout(caption, "below")

    script, div = components(p)
    embedfile = (
        f"_includes/{country.replace(' ', '')}_infections_mcmc_embed.html"
    )
    with open(embedfile, "w") as ff:
        ff.write(div)
        ff.write(script)

    return [
        f'{(dates[0] + datetime.timedelta(days=np.mean(np.exp(trace["logb"])))).strftime("%b %d, %Y")}',
        [
            np.mean(np.exp(trace["logc"])),
            *np.percentile(np.exp(trace["logc"]), [90, 10]),
        ],
    ]
Exemplo n.º 18
    def __init__(
        self,
        cell_state_mat: np.ndarray,
        X_data: np.ndarray,
        n_comb: int = 50,
        data_type: str = "float32",
        n_iter=20000,
        learning_rate=0.005,
        total_grad_norm_constraint=200,
        verbose=True,
        var_names=None,
        var_names_read=None,
        obs_names=None,
        fact_names=None,
        sample_id=None,
        gene_level_prior={
            "mean": 1 / 2,
            "sd": 1 / 4
        },
        gene_level_var_prior={"mean_var_ratio": 1.0},
        cell_number_prior={
            "cells_per_spot": 8.0,
            "factors_per_spot": 7.0,
            "combs_per_spot": 2.5
        },
        cell_number_var_prior={
            "cells_mean_var_ratio": 1.0,
            "factors_mean_var_ratio": 1.0,
            "combs_mean_var_ratio": 1.0
        },
        phi_hyp_prior={
            "mean": 3.0,
            "sd": 1.0
        },
        spot_fact_mean_var_ratio=5.0,
        exper_gene_level_mean_var_ratio=10,
    ):

        ############# Initialise parameters ################
        super().__init__(
            cell_state_mat,
            X_data,
            data_type,
            n_iter,
            learning_rate,
            total_grad_norm_constraint,
            verbose,
            var_names,
            var_names_read,
            obs_names,
            fact_names,
            sample_id,
        )

        for k in gene_level_var_prior.keys():
            gene_level_prior[k] = gene_level_var_prior[k]

        self.gene_level_prior = gene_level_prior
        self.phi_hyp_prior = phi_hyp_prior
        self.n_comb = n_comb
        self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio
        self.exper_gene_level_mean_var_ratio = exper_gene_level_mean_var_ratio

        # generate parameters for samples
        self.spot2sample_df = pd.get_dummies(sample_id)
        # convert to np.ndarray
        self.spot2sample_mat = self.spot2sample_df.values
        self.n_exper = self.spot2sample_mat.shape[1]
        # assign extra data to dictionary with (1) shared parameters (2) input data
        self.extra_data_tt = {
            "spot2sample":
            theano.shared(self.spot2sample_mat.astype(self.data_type))
        }
        self.extra_data = {
            "spot2sample": self.spot2sample_mat.astype(self.data_type)
        }

        cell_number_prior["factors_per_combs"] = (
            cell_number_prior["factors_per_spot"] /
            cell_number_prior["combs_per_spot"])
        for k in cell_number_var_prior.keys():
            cell_number_prior[k] = cell_number_var_prior[k]
        self.cell_number_prior = cell_number_prior

        ############# Define the model ################
        self.model = pm.Model()

        with self.model:

            # =====================Gene expression level scaling======================= #
            # Explains difference in expression between genes and
            # how it differs in single cell and spatial technology
            # compute hyperparameters from mean and sd
            shape = gene_level_prior["mean"]**2 / gene_level_prior["sd"]**2
            rate = gene_level_prior["mean"] / gene_level_prior["sd"]**2
            shape_var = shape / gene_level_prior["mean_var_ratio"]
            rate_var = rate / gene_level_prior["mean_var_ratio"]
            self.gene_level_alpha_hyp = pm.Gamma("gene_level_alpha_hyp",
                                                 mu=shape,
                                                 sigma=np.sqrt(shape_var),
                                                 shape=(1, 1))
            self.gene_level_beta_hyp = pm.Gamma("gene_level_beta_hyp",
                                                mu=rate,
                                                sigma=np.sqrt(rate_var),
                                                shape=(1, 1))

            # global gene levels
            self.gene_level = pm.Gamma("gene_level",
                                       self.gene_level_alpha_hyp,
                                       self.gene_level_beta_hyp,
                                       shape=(self.n_var, 1))
            # scale cell state factors by gene_level
            self.gene_factors = pm.Deterministic("gene_factors",
                                                 self.cell_state)
            # self.gene_factors = self.cell_state
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape)
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0))

            # =====================Spot factors======================= #
            # prior on spot factors reflects the number of cells, the fraction of their
            # cytoplasm captured, and heterogeneity in total mRNA between individual cells within each cell type
            self.cells_per_spot = pm.Gamma(
                "cells_per_spot",
                mu=cell_number_prior["cells_per_spot"],
                sigma=np.sqrt(cell_number_prior["cells_per_spot"] /
                              cell_number_prior["cells_mean_var_ratio"]),
                shape=(self.n_obs, 1),
            )
            self.comb_per_spot = pm.Gamma(
                "combs_per_spot",
                mu=cell_number_prior["combs_per_spot"],
                sigma=np.sqrt(cell_number_prior["combs_per_spot"] /
                              cell_number_prior["combs_mean_var_ratio"]),
                shape=(self.n_obs, 1),
            )

            shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
            rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
            self.combs_factors = pm.Gamma("combs_factors",
                                          alpha=shape,
                                          beta=rate,
                                          shape=(self.n_obs, self.n_comb))

            self.factors_per_combs = pm.Gamma(
                "factors_per_combs",
                mu=cell_number_prior["factors_per_combs"],
                sigma=np.sqrt(cell_number_prior["factors_per_combs"] /
                              cell_number_prior["factors_mean_var_ratio"]),
                shape=(self.n_comb, 1),
            )
            c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape(
                (1, 1))
            self.comb2fact = pm.Gamma("comb2fact",
                                      alpha=c2f_shape,
                                      beta=self.factors_per_combs,
                                      shape=(self.n_comb, self.n_fact))

            self.spot_factors = pm.Gamma(
                "spot_factors",
                mu=pm.math.dot(self.combs_factors, self.comb2fact),
                sigma=pm.math.sqrt(
                    pm.math.dot(self.combs_factors, self.comb2fact) /
                    self.spot_fact_mean_var_ratio),
                shape=(self.n_obs, self.n_fact),
            )

            # =====================Spot-specific additive component======================= #
            # molecule contribution that cannot be explained by cell state signatures
            # these counts are distributed between all genes not just expressed genes
            self.spot_add_hyp = pm.Gamma("spot_add_hyp", 1, 1, shape=2)
            self.spot_add = pm.Gamma("spot_add",
                                     self.spot_add_hyp[0],
                                     self.spot_add_hyp[1],
                                     shape=(self.n_obs, 1))

            # =====================Gene-specific additive component ======================= #
            # per gene molecule contribution that cannot be explained by cell state signatures
            # these counts are distributed equally between all spots (e.g. background, free-floating RNA)
            self.gene_add_hyp = pm.Gamma("gene_add_hyp", 1, 1, shape=2)
            self.gene_add = pm.Gamma("gene_add",
                                     self.gene_add_hyp[0],
                                     self.gene_add_hyp[1],
                                     shape=(self.n_exper, self.n_var))

            # =====================Gene-specific overdispersion ======================= #
            self.phi_hyp = pm.Gamma("phi_hyp",
                                    mu=phi_hyp_prior["mean"],
                                    sigma=phi_hyp_prior["sd"],
                                    shape=(1, 1))
            self.gene_E = pm.Exponential("gene_E",
                                         self.phi_hyp,
                                         shape=(self.n_exper, self.n_var))

            # =====================Expected expression ======================= #
            # expected expression
            self.mu_biol = (
                pm.math.dot(self.spot_factors, self.gene_factors.T) *
                self.gene_level.T +
                pm.math.dot(self.extra_data_tt["spot2sample"], self.gene_add) +
                self.spot_add)
            # tt.printing.Print('mu_biol')(self.mu_biol.shape)

            # =====================DATA likelihood ======================= #
            # Likelihood (sampling distribution) of observations & add overdispersion via NegativeBinomial / Poisson
            self.data_target = pm.NegativeBinomial(
                "data_target",
                mu=self.mu_biol,
                alpha=pm.math.dot(self.extra_data_tt["spot2sample"],
                                  1 / tt.pow(self.gene_E, 2)),
                observed=self.x_data,
                total_size=self.X_data.shape,
            )

            # =====================Compute nUMI from each factor in spots  ======================= #
            self.nUMI_factors = pm.Deterministic(
                "nUMI_factors", (self.spot_factors *
                                 (self.gene_factors * self.gene_level).sum(0)))
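The mean/sd-to-(shape, rate) conversion used for the gene-level prior above follows from the Gamma moments (mean = shape/rate, sd = sqrt(shape)/rate); checking it with the constructor defaults:

import numpy as np
mean, sd = 1 / 2, 1 / 4
shape, rate = mean**2 / sd**2, mean / sd**2
assert np.isclose(shape / rate, mean) and np.isclose(np.sqrt(shape) / rate, sd)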
Exemplo n.º 19
    # mu = pm.Uniform('mu', 0, 10)
    beta = pm.Normal('beta', 0, 20, shape=companiesABC)
    beta1 = pm.Normal('beta1', 0, 20, shape=companiesABC)
    beta2 = pm.Normal('beta2', 0, 10)
    # theta = pm.Uniform('theta', lower=0, upper=10)
    muu = tt.printing.Print('beta2')(beta2)  # debug: print beta2 during sampling
    mu = pm.Deterministic(
        'mu',
        tt.exp(beta[companyABC] + beta1[companyABC] * elec_year1 +
               beta2 * elec_tem1))
    # mu = tt.exp(beta + beta1 * elec_year + beta2 * elec_tem)
    # mu = pm.math.exp(theta)
    # Observed_pred = pm.NegativeBinomial("Observed_pred", mu=mu, alpha=sigma, shape=elec_faults.shape)  # observed values
    Observed = pm.NegativeBinomial("Observed",
                                   mu=mu,
                                   alpha=sigma,
                                   observed=elec_faults1)  # observed values

    start = pm.find_MAP()
    # step1 = pm.Slice([beta, beta1, beta2])
    # step = pm.Metropolis()
    trace2 = pm.sample(2000, start=start, tune=1000)
chain2 = trace2
varnames1 = ['beta', 'beta1', 'beta2', 'sigma', 'mu']
varnames2 = ['beta', 'beta1', 'beta2', 'sigma']
pm.plot_posterior(chain2, varnames1)
plt.show()

map_estimate = pm.find_MAP(model=unpooled_model)
print(map_estimate)
Exemplo n.º 20
    def build(self):
        """ Builds and returns the Generative model. Also sets self.model """

        p_delay = get_delay_distribution()
        nonzero_days = self.observed.total.gt(0)
        len_observed = len(self.observed)
        convolution_ready_gt = self._get_convolution_ready_gt(len_observed)
        x = np.arange(len_observed)[:, None]

        coords = {
            "date": self.observed.index.values,
            "nonzero_date":
            self.observed.index.values[self.observed.total.gt(0)],
        }
        with pm.Model(coords=coords) as self.model:

            # Let log_r_t walk randomly with a fixed prior of ~0.035. Think
            # of this number as how quickly r_t can react.
            log_r_t = pm.GaussianRandomWalk("log_r_t",
                                            sigma=0.035,
                                            dims=["date"])
            r_t = pm.Deterministic("r_t", pm.math.exp(log_r_t), dims=["date"])

            # For a given seed population and R_t curve, we calculate the
            # implied infection curve by simulating an outbreak. While this may
            # look daunting, it's simply a way to recreate the outbreak
            # simulation math inside the model:
            # https://staff.math.su.se/hoehle/blog/2020/04/15/effectiveR0.html
            seed = pm.Exponential("seed", 1 / 0.02)
            y0 = tt.zeros(len_observed)
            y0 = tt.set_subtensor(y0[0], seed)
            outputs, _ = theano.scan(
                fn=lambda t, gt, y, r_t: tt.set_subtensor(
                    y[t], tt.sum(r_t * y * gt)),
                sequences=[tt.arange(1, len_observed), convolution_ready_gt],
                outputs_info=y0,
                non_sequences=r_t,
                n_steps=len_observed - 1,
            )
            infections = pm.Deterministic("infections",
                                          outputs[-1],
                                          dims=["date"])

            # Convolve infections to confirmed positive reports based on a known
            # p_delay distribution. See patients.py for details on how we calculate
            # this distribution.
            test_adjusted_positive = pm.Deterministic(
                "test_adjusted_positive",
                conv2d(
                    tt.reshape(infections, (1, len_observed)),
                    tt.reshape(p_delay, (1, len(p_delay))),
                    border_mode="full",
                )[0, :len_observed],
                dims=["date"])

            # Picking an exposure with a prior that exposure never goes below
            # 0.1 * max_tests. The 0.1 only affects early values of Rt when
            # testing was minimal or when data errors cause underreporting
            # of tests.
            tests = pm.Data("tests", self.observed.total.values, dims=["date"])
            exposure = pm.Deterministic("exposure",
                                        pm.math.clip(
                                            tests,
                                            self.observed.total.max() * 0.1,
                                            1e9),
                                        dims=["date"])

            # Test-volume adjust reported cases based on an assumed exposure
            # Note: this is similar to the exposure parameter in a Poisson
            # regression.
            positive = pm.Deterministic("positive",
                                        exposure * test_adjusted_positive,
                                        dims=["date"])

            # Save data as part of trace so we can access in inference_data
            observed_positive = pm.Data("observed_positive",
                                        self.observed.positive.values,
                                        dims=["date"])
            nonzero_observed_positive = pm.Data(
                "nonzero_observed_positive",
                self.observed.positive[nonzero_days.values].values,
                dims=["nonzero_date"])

            positive_nonzero = pm.NegativeBinomial(
                "nonzero_positive",
                mu=positive[nonzero_days.values],
                alpha=pm.Gamma("alpha", mu=6, sigma=1),
                observed=nonzero_observed_positive,
                dims=["nonzero_date"])

        return self.model
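For intuition, the `theano.scan` above implements the renewal equation y[t] = r_t[t] * Σ_s y[t-s]·g[s]; a plain-numpy sketch under the assumption of a generation-interval pmf `g` indexed from lag 1 (illustrative, not the exact tensor layout of `convolution_ready_gt`):

import numpy as np

def renewal(r_t, g, seed):
    y = np.zeros(len(r_t))
    y[0] = seed
    for t in range(1, len(y)):
        lags = min(t, len(g))
        # today's infections: R_t times past infections weighted by generation time
        y[t] = r_t[t] * np.dot(y[t - lags:t][::-1], g[:lags])
    return y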
Exemplo n.º 21
def build_model(
    observed: pandas.DataFrame,
    p_generation_time: numpy.ndarray,
    p_delay: numpy.ndarray,
    test_col: str,
    buffer_days: int = 10,
    pmodel: typing.Optional[pymc3.Model] = None,
) -> pymc3.Model:
    """ Builds the Rt.live PyMC3 model.

    Model by Kevin Systrom, Thomas Vladek and Rtlive contributors.

    Parameters
    ----------
    observed : pandas.DataFrame
        date-indexed dataframe with column "new_cases" (daily positives) 
        and a column of daily tests whose name is specified by parameter [test_col]
    p_generation_time : numpy.ndarray
        numpy array that describes the generation time distribution
    p_delay : numpy.ndarray
        numpy array that describes the testing delay distribution
    test_col : str
        name of column with daily new tests (predicted or actual data)
    buffer_days : int
        number of days to prepend before the beginning of the data
    pmodel : optional, PyMC3 model
        an existing PyMC3 model object to use (not context-activated)

    Returns
    -------
    pmodel : pymc3.Model
        the (created) PyMC3 model
    """
    observed = observed.rename(columns={test_col: "daily_tests"})
    # Reindex to make sure that there are no gaps.
    # Also add (unobserved) buffer days at the beginning.
    observed = _reindex_observed(observed, buffer_days)

    # make boolean masks to filter for dates that have case data, testcount data or both
    has_cases = ~numpy.isnan(observed.new_cases).values
    has_testcounts = ~numpy.isnan(observed.daily_tests).values
    has_data = has_cases & has_testcounts
    # masks that can be used w.r.t. subsets of the dates.
    # These are used to slice tensors that are already shorter than the full length.
    has_data_wrt_cases = has_data[has_cases]
    has_data_wrt_testcounts = has_data[has_testcounts]

    coords = {
        # this is the full length of dates (without gaps) covered by the generative part of the model
        "date": observed.index.values,
        # these are subsets of dates where case/testcount data is available
        "date_with_cases": observed.index.values[has_cases],
        "date_with_testcounts": observed.index.values[has_testcounts],
        # and the dates with both case & testcount data (for the likelihood)
        "date_with_data": observed.index.values[has_data],
    }
    N_dates = len(coords["date"])
    N_with_cases = len(coords["date_with_cases"])
    N_with_testcounts = len(coords["date_with_testcounts"])
    N_with_data = len(coords["date_with_data"])
    _log.info(
        "The model describes %i days of which %i have case data and %i have testcount data. %i days have both.",
        N_dates, N_with_cases, N_with_testcounts, N_with_data)

    if not pmodel:
        pmodel = pymc3.Model(coords=coords)

    with pmodel:
        # Let log_r_t walk randomly with a fixed prior of ~0.035. Think
        # of this number as how quickly r_t can react.
        log_r_t = pymc3.GaussianRandomWalk("log_r_t",
                                           sigma=0.035,
                                           dims=["date"])
        r_t = pymc3.Deterministic("r_t",
                                  pymc3.math.exp(log_r_t),
                                  dims=["date"])

        # Save data as part of trace so we can access in inference_data
        t_generation_time = pymc3.Data("p_generation_time", p_generation_time)
        # precompute generation time interval vector to speed up tt.scan
        convolution_ready_gt = _to_convolution_ready_gt(
            p_generation_time, N_dates)
        # For a given seed population and R_t curve, we calculate the
        # implied infection curve by simulating an outbreak. While this may
        # look daunting, it's simply a way to recreate the outbreak
        # simulation math inside the model:
        # https://staff.math.su.se/hoehle/blog/2020/04/15/effectiveR0.html
        seed = pymc3.Exponential("seed", 1 / 0.02)
        y0 = tt.zeros(N_dates)
        y0 = tt.set_subtensor(y0[0], seed)
        outputs, _ = theano.scan(
            fn=lambda t, gt, y, r_t: tt.set_subtensor(y[t], tt.sum(r_t * y * gt
                                                                   )),
            sequences=[tt.arange(1, N_dates), convolution_ready_gt],
            outputs_info=y0,
            non_sequences=r_t,
            n_steps=N_dates - 1,
        )
        infections = pymc3.Deterministic("infections",
                                         outputs[-1],
                                         dims=["date"])

        t_p_delay = pymc3.Data("p_delay", p_delay)
        # Convolve infections to confirmed positive reports based on a known
        # p_delay distribution. See patients.py for details on how we calculate
        # this distribution.
        test_adjusted_positive = pymc3.Deterministic(
            "test_adjusted_positive",
            theano.tensor.signal.conv.conv2d(
                tt.reshape(infections, (1, N_dates)),
                tt.reshape(t_p_delay, (1, len(p_delay))),
                border_mode="full",
            )[0, :N_dates],
            dims=["date"])

        # Picking an exposure with a prior that exposure never goes below
        # 0.1 * max_tests. The 0.1 only affects early values of Rt when
        # testing was minimal or when data errors cause underreporting
        # of tests.
        tests = pymc3.Data("tests",
                           observed.daily_tests[has_testcounts],
                           dims=["date_with_testcounts"])
        exposure = pymc3.Deterministic("exposure",
                                       pymc3.math.clip(
                                           tests,
                                           observed.daily_tests.max() * 0.1,
                                           1e9),
                                       dims=["date_with_testcounts"])

        # Test-volume adjust reported cases based on an assumed exposure
        # Note: this is similar to the exposure parameter in a Poisson
        # regression.
        positive = pymc3.Deterministic("positive",
                                       exposure *
                                       test_adjusted_positive[has_testcounts],
                                       dims=["date_with_testcounts"])
        positive_where_data = pymc3.Deterministic(
            "positive_where_data",
            positive[has_data_wrt_testcounts],
            dims=["date_with_data"])

        observed_positive = pymc3.Data("observed_positive",
                                       observed.new_cases[has_cases],
                                       dims=["date_with_cases"])
        observed_positive_where_data = pymc3.Data(
            "observed_positive_where_data",
            observed.new_cases[has_cases][has_data_wrt_cases],
            dims=["date_with_data"])

        likelihood = pymc3.NegativeBinomial(
            "likelihood",
            mu=positive_where_data,
            alpha=pymc3.Gamma("alpha", mu=6, sigma=1),
            observed=observed_positive_where_data,
            dims=["date_with_data"])
    return pmodel
Exemplo n.º 22
def run(region, folder, load_trace=False, compute_sim=True, plot_posterior_dist = True):

    print("started ... " + region)

    if not os.path.exists(region):
        os.makedirs(region)

    # observed data
    (t_obs, datetimes, y_obs, n_pop, shutdown_day, u0, _) = data_fetcher.read_region_data(folder, region)
    y_obs = y_obs.astype(np.float64)
    u0 = u0.astype(np.float64)

    # set eqn
    eqn = Seir()
    eqn.population = n_pop
    eqn.tau = shutdown_day
    
    # set ode solver
    ti = t_obs[0]
    tf = t_obs[-1]
    m = 2
    n_steps = m*(tf - ti)
    rk = RKSolverSeir(ti, tf, n_steps)
    rk.rk_type = "explicit_euler"
    rk.output_frequency = m
    rk.set_output_storing_flag(True)
    rk.equation = eqn
    
    du0_dp = np.zeros((eqn.n_components(), eqn.n_parameters()))
    rk.set_initial_condition(u0, du0_dp)
    rk.set_output_gradient_flag(True)

    # sample posterior
    with pm.Model() as model:
        # set prior distributions
        #beta  = pm.Lognormal('beta',  mu = math.log(0.4/n_pop), sigma = 0.4)
        #sigma = pm.Lognormal('sigma', mu = math.log(0.3), sigma = 0.5)
        #gamma = pm.Lognormal('gamma', mu = math.log(0.25), sigma = 0.5)
        #kappa = pm.Lognormal('kappa', mu = math.log(0.1), sigma = 0.5)

        #beta  = pm.Normal('beta',  mu = 0.4/n_pop, sigma = 0.06/n_pop)
        #sigma = pm.Normal('sigma', mu = 0.6, sigma = 0.1)
        #gamma = pm.Normal('gamma', mu = 0.3, sigma = 0.07)
        #kappa = pm.Normal('kappa', mu = 0.5, sigma = 0.1)
        #tint = pm.Lognormal('tint', mu = math.log(30), sigma = 1)

        beta  = pm.Lognormal('beta',  mu = math.log(0.1), sigma = 0.5) #math.log(0.3/n_pop), sigma = 0.5)
        sigma = pm.Lognormal('sigma', mu = math.log(0.05), sigma = 0.6)
        gamma = pm.Lognormal('gamma', mu = math.log(0.05), sigma = 0.6)
        kappa = pm.Lognormal('kappa', mu = math.log(0.2), sigma = 0.3) # math.log(0.001), sigma = 0.8)
        tint = pm.Lognormal('tint', mu = math.log(30), sigma = math.log(10))
        dispersion = pm.Normal('dispersion', mu = 30., sigma = 10.)

        # set cached_sim object
        cached_sim = CachedSEIRSimulation(rk)
    
        # set theano model op object
        model = ModelOp(cached_sim)
    
        # set likelihood distribution
        y_sim = pm.NegativeBinomial('y_sim', mu=model((beta, sigma, gamma, kappa, tint)), alpha=dispersion, observed=y_obs)
        
        if not load_trace:
            # sample posterior distribution and save trace
            draws = 1000 #1000
            tune = 500 #500
            trace = pm.sample(draws=draws, tune=tune, cores=4, chains=4, nuts_kwargs=dict(target_accept=0.9), init='advi+adapt_diag') # using NUTS sampling
            # save trace
            pm.backends.text.dump(region + os.path.sep, trace)
        else:
            # load trace
            trace = pm.backends.text.load(region + os.path.sep)     
        
        if plot_posterior_dist:
            # plot posterior distributions of all parameters
            data = az.from_pymc3(trace=trace)
            pm.plots.traceplot(data, legend=True)
            plt.savefig(region + os.path.sep + "trace_plot.pdf")
            az.plot_posterior(data,  hdi_prob = 0.95)
            plt.savefig(region + os.path.sep + "post_dist.pdf")

        if compute_sim:
            #rk.set_output_gradient_flag(False)
            n_predictions = 7
            rk.final_time = rk.final_time + n_predictions
            rk.n_steps = rk.n_steps + m*n_predictions

            y_sims = pm.sample_posterior_predictive(trace)['y_sim'][:,0,:]
            np.savetxt(region + os.path.sep + "y_sims.csv", y_sims, delimiter = ',')
            mean_y = np.mean(y_sims,axis=0)
            upper_y = np.percentile(y_sims,q=97.5,axis=0)
            lower_y = np.percentile(y_sims,q=2.5,axis=0)
    
            # plots
            dates = [dt.datetime.strptime(date, "%Y-%m-%d").date() for date in datetimes]
            pred_dates = dates + [dates[-1] + dt.timedelta(days=i) for i in range(1,1 + n_predictions)]

            np.savetxt(region + os.path.sep + "y_obs.csv", y_obs, delimiter = ',')

            pd.DataFrame(pred_dates).to_csv(region + os.path.sep + 'dates.csv', header=False, index=False)
            # linear plot
            font_size = 12
            fig, ax = plt.subplots(figsize=(10, 10))
            ax.plot(dates, y_obs, 'x', color='k', label='reported data')
            import matplotlib.dates as mdates
            ax.xaxis.set_major_formatter(mdates.DateFormatter('%d %b'))
            ax.xaxis.set_major_locator(mdates.DayLocator(bymonthday=(1,15)))
            plt.title(region[0].upper() + region[1:].lower() + "'s daily infections", fontsize = font_size)
            plt.xlabel('Date', fontsize = font_size)
            plt.ylabel('New daily infections', fontsize = font_size)
            ax.tick_params(axis='both', which='major', labelsize=10)

            # plot propagated uncertainty
            plt.plot(pred_dates, mean_y, color='g', lw=2, label='mean')
            plt.fill_between(pred_dates, lower_y, upper_y, color='darkseagreen', label='95% credible interval')
            plt.legend(loc='upper left')
            fig.autofmt_xdate()
            plt.savefig(region + os.path.sep + "linear.pdf")

            # log plot
            plt.yscale('log')
            plt.savefig(region + os.path.sep + "log.pdf")
    
    print("finished ... " + region)
Exemplo n.º 23
    def __init__(self,
                 cell_state_mat: np.ndarray,
                 X_data: np.ndarray,
                 Y_data: np.ndarray,
                 n_comb: int = 50,
                 data_type: str = 'float32',
                 n_iter=20000,
                 learning_rate=0.005,
                 total_grad_norm_constraint=200,
                 verbose=True,
                 var_names=None,
                 var_names_read=None,
                 obs_names=None,
                 fact_names=None,
                 sample_id=None,
                 gene_level_prior={
                     'mean': 1 / 2,
                     'sd': 1 / 4
                 },
                 gene_level_var_prior={'mean_var_ratio': 1},
                 cell_number_prior={
                     'cells_per_spot': 8,
                     'factors_per_spot': 7,
                     'combs_per_spot': 2.5
                 },
                 cell_number_var_prior={
                     'cells_mean_var_ratio': 1,
                     'factors_mean_var_ratio': 1,
                     'combs_mean_var_ratio': 1
                 },
                 phi_hyp_prior={
                     'mean': 3,
                     'sd': 1
                 },
                 spot_fact_mean_var_ratio=0.5):

        ############# Initialise parameters ################
        super().__init__(cell_state_mat, X_data, data_type, n_iter,
                         learning_rate, total_grad_norm_constraint, verbose,
                         var_names, var_names_read, obs_names, fact_names,
                         sample_id)

        self.Y_data = Y_data
        self.n_npro = Y_data.shape[1]
        self.y_data = theano.shared(Y_data.astype(self.data_type))
        self.n_rois = Y_data.shape[0]
        # Total number of gene counts in each region of interest, divided by 10^5:
        self.l_r = np.array([np.sum(X_data[i, :]) for i in range(self.n_rois)
                             ]).reshape(self.n_rois, 1) * 10**(-5)

        for k in gene_level_var_prior.keys():
            gene_level_prior[k] = gene_level_var_prior[k]

        self.gene_level_prior = gene_level_prior
        self.phi_hyp_prior = phi_hyp_prior
        self.n_comb = n_comb
        self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio

        cell_number_prior['factors_per_combs'] = (
            cell_number_prior['factors_per_spot'] /
            cell_number_prior['combs_per_spot'])
        for k in cell_number_var_prior.keys():
            cell_number_prior[k] = cell_number_var_prior[k]
        self.cell_number_prior = cell_number_prior

        ############# Define the model ################
        self.model = pm.Model()

        with self.model:

            # ===================== Non-specific binding additive component ======================= #
            # Additive terms for non-specific binding of gene probes are drawn from a gamma distribution
            # with the same mean and variance as for the negative probes.
            self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2)
            self.gene_add = pm.Gamma('gene_add',
                                     self.gene_add_hyp[0],
                                     self.gene_add_hyp[1],
                                     shape=(self.n_genes, 1))

            # =====================Gene expression level scaling======================= #
            # Explains differences in expression between genes and
            # how expression differs between single-cell and spatial technologies.
            # Compute gamma hyperparameters from the prior mean and sd:
            shape = gene_level_prior['mean']**2 / gene_level_prior['sd']**2
            rate = gene_level_prior['mean'] / gene_level_prior['sd']**2
            shape_var = shape / gene_level_prior['mean_var_ratio']
            rate_var = rate / gene_level_prior['mean_var_ratio']
            self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp',
                                                 mu=shape,
                                                 sigma=np.sqrt(shape_var),
                                                 shape=(1, 1))
            self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp',
                                                mu=rate,
                                                sigma=np.sqrt(rate_var),
                                                shape=(1, 1))

            self.gene_level = pm.Gamma('gene_level',
                                       self.gene_level_alpha_hyp,
                                       self.gene_level_beta_hyp,
                                       shape=(self.n_genes, 1))

            self.gene_factors = pm.Deterministic('gene_factors',
                                                 self.cell_state)

            # =====================Spot factors======================= #
            # prior on spot factors reflects the number of cells and the fraction of their cytoplasm captured,
            # times the heterogeneity in total mRNA content between individual cells of each cell type
            self.cells_per_spot = pm.Gamma('cells_per_spot',
                                           mu=cell_number_prior['cells_per_spot'],
                                           sigma=np.sqrt(cell_number_prior['cells_per_spot'] \
                                                         / cell_number_prior['cells_mean_var_ratio']),
                                           shape=(self.n_cells, 1))
            self.comb_per_spot = pm.Gamma('combs_per_spot',
                                          mu=cell_number_prior['combs_per_spot'],
                                          sigma=np.sqrt(cell_number_prior['combs_per_spot'] \
                                                        / cell_number_prior['combs_mean_var_ratio']),
                                          shape=(self.n_cells, 1))

            shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
            rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
            self.combs_factors = pm.Gamma('combs_factors',
                                          alpha=shape,
                                          beta=rate,
                                          shape=(self.n_cells, self.n_comb))

            self.factors_per_combs = pm.Gamma('factors_per_combs',
                                              mu=cell_number_prior['factors_per_combs'],
                                              sigma=np.sqrt(cell_number_prior['factors_per_combs'] \
                                                            / cell_number_prior['factors_mean_var_ratio']),
                                              shape=(self.n_comb, 1))
            c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape(
                (1, 1))
            self.comb2fact = pm.Gamma('comb2fact',
                                      alpha=c2f_shape,
                                      beta=self.factors_per_combs,
                                      shape=(self.n_comb, self.n_fact))

            self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, self.comb2fact),
                                         sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \
                                                            / self.spot_fact_mean_var_ratio),
                                         shape=(self.n_cells, self.n_fact))

            # =====================Spot-specific additive component======================= #
            # molecule contribution that cannot be explained by cell-state signatures;
            # these counts are distributed across all genes, not just expressed genes
            self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2)
            self.spot_add = pm.Gamma('spot_add',
                                     self.spot_add_hyp[0],
                                     self.spot_add_hyp[1],
                                     shape=(self.n_cells, 1))

            # =====================Gene-specific overdispersion ======================= #
            self.phi_hyp = pm.Gamma('phi_hyp',
                                    mu=phi_hyp_prior['mean'],
                                    sigma=phi_hyp_prior['sd'],
                                    shape=(1, 1))
            self.gene_E = pm.Exponential('gene_E',
                                         self.phi_hyp,
                                         shape=(self.n_genes, 1))

            # =====================Expected expression ======================= #
            # Expected counts combine cell-state signatures scaled per gene with
            # gene- and spot-specific additive background terms.
            self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T \
               + self.gene_add.T + self.spot_add

            # =====================DATA likelihood ======================= #
            # Likelihood (sampling distribution) of observations; the NegativeBinomial adds overdispersion relative to a Poisson
            self.data_target = pm.NegativeBinomial(
                'data_target',
                mu=self.mu_biol,
                alpha=1 / (self.gene_E.T * self.gene_E.T),
                observed=self.x_data)

            # =====================Compute nUMI from each factor in spots  ======================= #
            self.nUMI_factors = pm.Deterministic(
                'nUMI_factors', (self.spot_factors *
                                 (self.gene_factors * self.gene_level).sum(0)))
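
The __init__ above (and its variants later in this file) converts a (mean, sd) prior specification into gamma shape/rate parameters via shape = mean**2 / sd**2 and rate = mean / sd**2. A quick numerical check of that identity (illustrative only, not part of the original class):

from scipy import stats

mean, sd = 1 / 2, 1 / 4
shape = mean ** 2 / sd ** 2  # 4.0
rate = mean / sd ** 2        # 8.0

# scipy parameterises the gamma distribution by shape and scale = 1 / rate
g = stats.gamma(a=shape, scale=1 / rate)
print(g.mean(), g.std())     # 0.5 0.25 -> recovers the requested mean and sd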
Example No. 24
# Convert categorical variables to integer
le = preprocessing.LabelEncoder()
data_idx = le.fit_transform(data.Hour)
hours = le.classes_
n_hours = len(hours)

indiv_traces = {}
for h in [8]:
    print('Hour: ', h)
    with pm.Model() as model:
        alpha = pm.Uniform('alpha', lower=0, upper=20)
        mu = pm.Uniform('mu', lower=0, upper=20)

        y_obs = data[data.Hour == h]['Connected'].values

        y_est = pm.NegativeBinomial('y_est',
                                    mu=mu,
                                    alpha=alpha,
                                    observed=y_obs)

        y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha)

        trace = pm.sample(25, progressbar=True)

        indiv_traces[h] = trace

#%% Plot NegBino Traces per Hour

fig, axs = plt.subplots(n_hours, 2, figsize=(10, 48))
axs = axs.ravel()

colLeft = np.arange(0, 48, 2)
colRight = np.arange(1, 48, 2)
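
colLeft and colRight index the left and right columns of the flattened two-column axes grid. The plotting loop itself is cut off here; a hedged reconstruction of how it presumably continues (not the original code):

# Observed counts in the left column, posterior predictive draws of
# y_pred in the right column, one row per hour.
for i, h in enumerate(hours):
    if h not in indiv_traces:
        continue
    axs[colLeft[i]].hist(data[data.Hour == h]['Connected'].values, bins=20)
    axs[colLeft[i]].set_title('Hour {}: observed'.format(h))
    axs[colRight[i]].hist(indiv_traces[h].get_values('y_pred'), bins=20)
    axs[colRight[i]].set_title('Hour {}: posterior predictive'.format(h))
plt.tight_layout()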
Example No. 25
def lohhla_clone_model(sample_ids,
                       tree_edges,
                       clonal_prevalence_mat,
                       cellularity,
                       ploidy_values,
                       tumour_sample_reads,
                       normal_sample_reads,
                       integercpn_info,
                       all_genotypes,
                       transition_inputs,
                       stayrate_alpha=0.9,
                       stayrate_beta=0.1,
                       sd=0.5,
                       nb_alpha=0.5,
                       iter_count=20000,
                       tune_iters=20000,
                       anchor_type='nb',
                       anchor_mode='snvcn',
                       nchains=2,
                       njobs=2):
    '''
    stayrate_alpha: Beta prior alpha-parameter on the stay rate in the clone tree Markov chain
    stayrate_beta: Beta prior beta-parameter on the stay rate in the clone tree Markov chain
        (note: the body below currently places a bounded normal prior on the stay rate,
        so these two parameters are unused)
    all_genotypes: DataFrame of genotypes, 0-indexed
    '''
    num_nodes = clonal_prevalence_mat.shape[1]

    valid_transitions = transition_inputs['valid_transitions']
    num_transitions = transition_inputs['num_transitions']
    num_genotypes = transition_inputs['num_genotypes']
    cn_genotype_matrix = transition_inputs['cn_genotype_matrix']

    ## Beta-binomial dispersion (higher = less dispersed)
    dispersion = 200.

    ## Tree edges
    edges = tree_edges.to_numpy().astype(int) - 1  # .as_matrix() was removed from pandas

    with pm.Model() as model:
        BoundedNormal = pm.Bound(pm.Normal, lower=0., upper=1.)
        stay_rate = BoundedNormal('stayrate', mu=0.75, sd=0.4)

        P = np.zeros(shape=(num_genotypes, num_genotypes))
        P = P + tt.eye(num_genotypes) * stay_rate

        fill_values = tt.as_tensor((1. - stay_rate) / num_transitions)
        fill_values = tt.set_subtensor(fill_values[0], 0)

        P = P + valid_transitions * fill_values[:, np.newaxis]
        P = tt.set_subtensor(P[0, 0], 1.)

        A = tt.dmatrix('A')

        PA = tt.ones(shape=(num_genotypes)) / num_genotypes

        states = CloneTreeGenotypes('genotypes',
                                    PA=PA,
                                    P=P,
                                    edges=edges,
                                    k=num_genotypes,
                                    shape=(num_nodes))

        total_cns = theano.shared(np.array(all_genotypes['total_cn'].values))
        alt_cns = theano.shared(np.array(all_genotypes['alt_cn'].values))

        total_cn = pm.Deterministic('total_cn', total_cns[states])
        alt_cn = pm.Deterministic('alt_cn', alt_cns[states])

        sample_alt_copies = tt.dot(clonal_prevalence_mat, alt_cn
                                   ) * cellularity + (1. - cellularity) * 1.

        vafs = sample_alt_copies / (
            tt.dot(clonal_prevalence_mat, total_cn) * cellularity +
            (1. - cellularity) * 2.)
        pm.Deterministic('vafs', vafs)

        alphas = vafs * dispersion
        betas = (1 - vafs) * dispersion

        ## Copy number of tumour cells (aggregated over clones, but not including normal contamination)
        tutotalcn = pm.Deterministic('tutotalcn',
                                     tt.dot(clonal_prevalence_mat, total_cn))

        ## Can't be vectorized further
        for j in range(len(sample_ids)):
            current_sample = sample_ids[j]
            total_counts = integercpn_info['TumorCov_type1'][
                current_sample].values + integercpn_info['TumorCov_type2'][
                    current_sample].values
            alt_counts = integercpn_info['TumorCov_type2'][
                current_sample].values
            alpha_sel = alphas[j]
            beta_sel = betas[j]

            ## Draw alternative allele counts for HLA locus for each polymorphic site
            alt_reads = pm.BetaBinomial('x_' + str(j),
                                        alpha=alpha_sel,
                                        beta=beta_sel,
                                        n=total_counts,
                                        observed=alt_counts)

            mult_factor_mean = (tumour_sample_reads[current_sample] /
                                normal_sample_reads)

            ploidy = ploidy_values[j]
            ploidy_ratio = (tutotalcn[j] * cellularity[j] +
                            (1 - cellularity[j]) * 2) / (
                                cellularity[j] * ploidy +
                                (1 - cellularity[j]) * 2)
            if anchor_mode == 'snvcn':
                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j), 1. / ploidy_ratio *
                    (integercpn_info['Total_TumorCov'][current_sample].values /
                     integercpn_info['Total_NormalCov'][current_sample].values)
                )
                nloci = len(
                    integercpn_info['Total_TumorCov'][current_sample].values)

                tumour_reads_observed = integercpn_info['Total_TumorCov'][
                    current_sample].values
                normal_reads_observed = integercpn_info['Total_NormalCov'][
                    current_sample].values
            elif anchor_mode == 'binmedian':
                binvar_tumour = 'combinedBinTumor'
                binvar_normal = 'combinedBinNormal'
                ## All within a bin are the same, so this is OK
                duplicated_entries = integercpn_info['binNum'][
                    current_sample].duplicated(keep='first')
                nloci = len(integercpn_info[binvar_tumour][current_sample]
                            [~duplicated_entries].values)

                mult_factor_computed = pm.Deterministic(
                    'mult_factor_computed_' + str(j),
                    (1. / ploidy_ratio *
                     (integercpn_info[binvar_tumour][current_sample]
                      [~duplicated_entries].values /
                      integercpn_info[binvar_normal][current_sample]
                      [~duplicated_entries].values)))

                tumour_reads_observed = integercpn_info[binvar_tumour][
                    current_sample][~duplicated_entries].values
                normal_reads_observed = integercpn_info[binvar_normal][
                    current_sample][~duplicated_entries].values
            else:
                raise Exception("Invalid option specified.")

            ## Draw ploidy-corrected tumour/normal locus coverage ratio for each polymorphic site

            if anchor_type == 'mult_factor':
                mult_factor = pm.Lognormal('mult_factor_' + str(j),
                                           mu=np.log(mult_factor_mean),
                                           sd=sd,
                                           observed=mult_factor_computed,
                                           shape=(nloci))
            elif anchor_type == 'nb':
                tc_nc_ratio = pm.Deterministic(
                    'tc_nc_ratio_' + str(j), (tutotalcn[j] * cellularity[j] +
                                              (1 - cellularity[j]) * 2) /
                    (ploidy * cellularity[j] + (1 - cellularity[j]) * 2))

                tumoursamplecn = pm.Deterministic(
                    'tumoursamplecn_' + str(j),
                    (tutotalcn[j] * cellularity[j] + (1 - cellularity[j]) * 2))

                tumour_reads_mean = pm.Deterministic(
                    'tumour_reads_mean_' + str(j),
                    tc_nc_ratio * mult_factor_mean * normal_reads_observed)

                tumour_reads = pm.NegativeBinomial(
                    'tumour_reads_' + str(j),
                    mu=tumour_reads_mean,
                    alpha=nb_alpha,
                    observed=tumour_reads_observed)
            else:
                raise Exception('Must specify a valid model type.')

        pm.Deterministic('log_prob', model.logpt)

        step1 = pm.CategoricalGibbsMetropolis(vars=[states])
        step2 = pm.Metropolis(vars=[stay_rate])

        trace = pm.sample(iter_count,
                          tune=tune_iters,
                          step=[step1, step2],
                          njobs=njobs,
                          chains=nchains)

        return trace
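
The vafs expression in this model mixes tumour clones with normal contamination: alternate copies contribute (prevalence . alt_cn) * cellularity + (1 - cellularity) * 1, and total copies (prevalence . total_cn) * cellularity + (1 - cellularity) * 2. A toy two-clone check of that formula with made-up numbers:

import numpy as np

clonal_prevalence = np.array([0.6, 0.4])  # clone mixture in one sample
alt_cn = np.array([1, 2])                 # alternate-allele copies per clone
total_cn = np.array([2, 3])               # total copies per clone
cellularity = 0.8                         # tumour fraction of the sample

sample_alt_copies = clonal_prevalence @ alt_cn * cellularity + (1 - cellularity) * 1.
sample_total_copies = clonal_prevalence @ total_cn * cellularity + (1 - cellularity) * 2.
print(sample_alt_copies / sample_total_copies)  # expected VAF, ~0.57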
    def __init__(
        self,
        cell_state_mat: np.ndarray,
        X_data: np.ndarray,
        n_comb: int = 50,
        data_type: str = 'float32',
        n_iter=20000,
        learning_rate=0.005,
        total_grad_norm_constraint=200,
        verbose=True,
        var_names=None,
        var_names_read=None,
        obs_names=None,
        fact_names=None,
        sample_id=None,
        cell_number_prior={
            'cells_per_spot': 8,
            'factors_per_spot': 7,
            'combs_per_spot': 2.5
        },
        cell_number_var_prior={
            'cells_mean_var_ratio': 1,
            'factors_mean_var_ratio': 1,
            'combs_mean_var_ratio': 1
        },
        phi_hyp_prior={
            'mean': 3,
            'sd': 1
        },
        spot_fact_mean_var_ratio=5,
        exper_gene_level_mean_var_ratio=10,
    ):

        ############# Initialise parameters ################
        super().__init__(cell_state_mat, X_data, data_type, n_iter,
                         learning_rate, total_grad_norm_constraint, verbose,
                         var_names, var_names_read, obs_names, fact_names,
                         sample_id)

        self.phi_hyp_prior = phi_hyp_prior
        self.n_comb = n_comb
        self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio
        self.exper_gene_level_mean_var_ratio = exper_gene_level_mean_var_ratio

        # generate parameters for samples
        self.spot2sample_df = pd.get_dummies(sample_id)
        # convert to np.ndarray
        self.spot2sample_mat = self.spot2sample_df.values
        self.n_exper = self.spot2sample_mat.shape[1]
        # assign extra data to dictionary with (1) shared parameters (2) input data
        self.extra_data_tt = {
            'spot2sample':
            theano.shared(self.spot2sample_mat.astype(self.data_type))
        }
        self.extra_data = {
            'spot2sample': self.spot2sample_mat.astype(self.data_type)
        }

        cell_number_prior['factors_per_combs'] = (
            cell_number_prior['factors_per_spot'] /
            cell_number_prior['combs_per_spot'])
        for k in cell_number_var_prior.keys():
            cell_number_prior[k] = cell_number_var_prior[k]
        self.cell_number_prior = cell_number_prior

        ############# Define the model ################
        self.model = pm.Model()

        with self.model:

            # =====================Gene expression level scaling======================= #
            # scale cell state factors by gene_level
            self.gene_factors = pm.Deterministic('gene_factors',
                                                 self.cell_state)
            #self.gene_factors = self.cell_state
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape)
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0))

            # =====================Spot factors======================= #
            # prior on spot factors reflects the number of cells and the fraction of their cytoplasm captured,
            # times the heterogeneity in total mRNA content between individual cells of each cell type
            self.cells_per_spot = pm.Gamma('cells_per_spot',
                                           mu=cell_number_prior['cells_per_spot'],
                                           sigma=np.sqrt(cell_number_prior['cells_per_spot'] \
                                                         / cell_number_prior['cells_mean_var_ratio']),
                                           shape=(self.n_obs, 1))
            self.comb_per_spot = pm.Gamma('combs_per_spot',
                                          mu=cell_number_prior['combs_per_spot'],
                                          sigma=np.sqrt(cell_number_prior['combs_per_spot'] \
                                                        / cell_number_prior['combs_mean_var_ratio']),
                                          shape=(self.n_obs, 1))

            shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
            rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
            self.combs_factors = pm.Gamma('combs_factors',
                                          alpha=shape,
                                          beta=rate,
                                          shape=(self.n_obs, self.n_comb))

            self.factors_per_combs = pm.Gamma('factors_per_combs',
                                              mu=cell_number_prior['factors_per_combs'],
                                              sigma=np.sqrt(cell_number_prior['factors_per_combs'] \
                                                            / cell_number_prior['factors_mean_var_ratio']),
                                              shape=(self.n_comb, 1))
            c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape(
                (1, 1))
            self.comb2fact = pm.Gamma('comb2fact',
                                      alpha=c2f_shape,
                                      beta=self.factors_per_combs,
                                      shape=(self.n_comb, self.n_fact))

            self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, self.comb2fact),
                                         sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \
                                                            / self.spot_fact_mean_var_ratio),
                                         shape=(self.n_obs, self.n_fact))

            # =====================Spot-specific additive component======================= #
            # molecule contribution that cannot be explained by cell-state signatures;
            # these counts are distributed across all genes, not just expressed genes
            self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2)
            self.spot_add = pm.Gamma('spot_add',
                                     self.spot_add_hyp[0],
                                     self.spot_add_hyp[1],
                                     shape=(self.n_obs, 1))

            # =====================Gene-specific additive component ======================= #
            # per-gene molecule contribution that cannot be explained by cell-state signatures;
            # these counts are distributed equally across all spots (e.g. background, free-floating RNA)
            self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2)
            self.gene_add = pm.Gamma('gene_add',
                                     self.gene_add_hyp[0],
                                     self.gene_add_hyp[1],
                                     shape=(self.n_exper, self.n_var))

            # =====================Gene-specific overdispersion ======================= #
            self.phi_hyp = pm.Gamma('phi_hyp',
                                    mu=phi_hyp_prior['mean'],
                                    sigma=phi_hyp_prior['sd'],
                                    shape=(1, 1))
            self.gene_E = pm.Exponential('gene_E',
                                         self.phi_hyp,
                                         shape=(self.n_exper, self.n_var))

            # =====================Expected expression ======================= #
            # expected expression
            self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) \
                           + pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_add) + self.spot_add
            # tt.printing.Print('mu_biol')(self.mu_biol.shape)

            # =====================DATA likelihood ======================= #
            # Likelihood (sampling distribution) of observations; the NegativeBinomial adds overdispersion relative to a Poisson
            self.data_target = pm.NegativeBinomial(
                'data_target',
                mu=self.mu_biol,
                alpha=pm.math.dot(self.extra_data_tt['spot2sample'],
                                  1 / tt.pow(self.gene_E, 2)),
                observed=self.x_data,
                total_size=self.X_data.shape)

            # =====================Compute nUMI from each factor in spots  ======================= #
            self.nUMI_factors = pm.Deterministic('nUMI_factors',
                                                 (self.spot_factors *
                                                  (self.gene_factors).sum(0)))
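
The spot2sample matrix built in this __init__ is a one-hot encoding of sample_id, used to broadcast per-experiment parameters (gene_add, gene_E) to every spot. A tiny illustration with hypothetical sample labels:

import pandas as pd

sample_id = ['s1', 's1', 's2', 's3', 's2']
spot2sample = pd.get_dummies(sample_id).values.astype(int)
print(spot2sample)
# [[1 0 0]
#  [1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [0 1 0]]
# spot2sample @ per_sample_params then yields one parameter row per spot.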
Example No. 27
with pm.Model() as model:
    hyper_alpha_sd = pm.Uniform('hyper_alpha_sd', lower=0, upper=50)
    hyper_alpha_mu = pm.Uniform('hyper_alpha_mu', lower=0, upper=10)

    hyper_mu_sd = pm.Uniform('hyper_mu_sd', lower=0, upper=50)
    hyper_mu_mu = pm.Uniform('hyper_mu_mu', lower=0, upper=60)

    alpha = pm.Gamma('alpha',
                     mu=hyper_alpha_mu,
                     sd=hyper_alpha_sd,
                     shape=n_participants)
    mu = pm.Gamma('mu', mu=hyper_mu_mu, sd=hyper_mu_sd, shape=n_participants)

    y_est = pm.NegativeBinomial('y_est',
                                mu=mu[participants_idx],
                                alpha=alpha[participants_idx],
                                observed=messages['time_delay_seconds'].values)

    y_pred = pm.NegativeBinomial('y_pred',
                                 mu=mu[participants_idx],
                                 alpha=alpha[participants_idx],
                                 shape=messages['prev_sender'].shape)

    start = pm.find_MAP()
    step = pm.Metropolis()
    hierarchical_trace = pm.sample(20000, step=step, progressbar=True)

_ = pm.traceplot(hierarchical_trace[12000:],
                 varnames=[
                     'mu', 'alpha', 'hyper_mu_mu', 'hyper_mu_sd',
                     'hyper_alpha_mu', 'hyper_alpha_sd'
                 ])
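
With per-participant mu and alpha, a quick way to compare participants against each other and the hyperpriors is a forest plot of the same trace (a sketch using the same PyMC3 plotting API as above; exact keyword names vary across PyMC3 versions):

_ = pm.forestplot(hierarchical_trace[12000:], varnames=['mu'])
_ = pm.forestplot(hierarchical_trace[12000:], varnames=['alpha'])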
Example No. 28
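This example references x1, x2, Groups, NGroups, and mu defined earlier in its original source. A minimal synthetic setup so the snippet below runs end to end (all names and values here are hypothetical):

import numpy as np
import pymc3 as pm
from scipy.stats import nbinom

np.random.seed(1)
N, NGroups = 200, 5
x1 = np.random.normal(size=N)
x2 = np.random.normal(size=N)
Groups = np.random.randint(0, NGroups, size=N)
# per-observation 'n' parameter for the simulated negative binomial counts
mu = np.exp(1.0 + 0.5 * x1 - 0.3 * x2)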
y = nbinom.rvs(mu, 0.5)

with pm.Model() as model:
    # Define priors
    alpha = pm.Uniform('alpha', 0, 100)
    sigma_a = pm.Uniform('sigma_a', 0, 10)
    beta1 = pm.Normal('beta1', 0, sd=100)
    beta2 = pm.Normal('beta2', 0, sd=100)
    beta3 = pm.Normal('beta3', 0, sd=100)

    # priors for random intercept (RI) parameters
    a_param = pm.Normal(
        'a_param',
        np.repeat(0, NGroups),  # mean
        sd=np.repeat(sigma_a, NGroups),  # standard deviation
        shape=NGroups)  # number of RI parameters

    eta = beta1 + beta2 * x1 + beta3 * x2 + a_param[Groups]

    # Define likelihood
    y_like = pm.NegativeBinomial('y', mu=pm.math.exp(eta), alpha=alpha, observed=y)

    # Fit
    start = pm.find_MAP()  # Find starting value by optimization
    step = pm.NUTS(scaling=start)  # Initialise the NUTS sampler at the MAP point
    trace = pm.sample(7000, step, start=start)

# Print summary to screen
pm.summary(trace)
    def __init__(self,
                 cell_state_mat: np.ndarray,
                 X_data: np.ndarray,
                 n_comb: int = 50,
                 data_type: str = 'float32',
                 n_iter=20000,
                 learning_rate=0.005,
                 total_grad_norm_constraint=200,
                 verbose=True,
                 var_names=None,
                 var_names_read=None,
                 obs_names=None,
                 fact_names=None,
                 sample_id=None,
                 gene_level_prior={
                     'mean': 1 / 2,
                     'sd': 1 / 4
                 },
                 gene_level_var_prior={'mean_var_ratio': 1},
                 cell_number_prior={
                     'cells_per_spot': 8,
                     'factors_per_spot': 7,
                     'combs_per_spot': 2.5
                 },
                 cell_number_var_prior={
                     'cells_mean_var_ratio': 1,
                     'factors_mean_var_ratio': 1,
                     'combs_mean_var_ratio': 1
                 },
                 phi_hyp_prior={
                     'mean': 3,
                     'sd': 1
                 },
                 spot_fact_mean_var_ratio=5):

        ############# Initialise parameters ################
        super().__init__(cell_state_mat, X_data, data_type, n_iter,
                         learning_rate, total_grad_norm_constraint, verbose,
                         var_names, var_names_read, obs_names, fact_names,
                         sample_id)

        for k in gene_level_var_prior.keys():
            gene_level_prior[k] = gene_level_var_prior[k]

        self.gene_level_prior = gene_level_prior
        self.phi_hyp_prior = phi_hyp_prior
        self.n_comb = n_comb
        self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio

        cell_number_prior['factors_per_combs'] = (
            cell_number_prior['factors_per_spot'] /
            cell_number_prior['combs_per_spot'])
        for k in cell_number_var_prior.keys():
            cell_number_prior[k] = cell_number_var_prior[k]
        self.cell_number_prior = cell_number_prior

        ############# Define the model ################
        self.model = pm.Model()

        with self.model:

            # =====================Gene expression level scaling======================= #
            # Explains differences in expression between genes and
            # how expression differs between single-cell and spatial technologies.
            # Compute gamma hyperparameters from the prior mean and sd:
            shape = gene_level_prior['mean']**2 / gene_level_prior['sd']**2
            rate = gene_level_prior['mean'] / gene_level_prior['sd']**2
            shape_var = shape / gene_level_prior['mean_var_ratio']
            rate_var = rate / gene_level_prior['mean_var_ratio']
            n_g_prior = np.array(gene_level_prior['mean']).shape
            if len(n_g_prior) == 0:
                n_g_prior = 1
            else:
                n_g_prior = self.n_var
            self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp',
                                                 mu=shape,
                                                 sigma=np.sqrt(shape_var),
                                                 shape=(n_g_prior, 1))
            self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp',
                                                mu=rate,
                                                sigma=np.sqrt(rate_var),
                                                shape=(n_g_prior, 1))

            self.gene_level = pm.Gamma('gene_level',
                                       self.gene_level_alpha_hyp,
                                       self.gene_level_beta_hyp,
                                       shape=(self.n_var, 1))

            # scale cell state factors by gene_level
            self.gene_factors = pm.Deterministic('gene_factors',
                                                 self.cell_state)
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0).shape)
            # tt.printing.Print('gene_factors sum')(gene_factors.sum(0))

            # =====================Spot factors======================= #
            # prior on spot factors reflects the number of cells and the fraction of their cytoplasm captured,
            # times the heterogeneity in total mRNA content between individual cells of each cell type
            self.cells_per_spot = pm.Gamma('cells_per_spot',
                                           mu=cell_number_prior['cells_per_spot'],
                                           sigma=np.sqrt(cell_number_prior['cells_per_spot'] \
                                                         / cell_number_prior['cells_mean_var_ratio']),
                                           shape=(self.n_obs, 1))
            self.comb_per_spot = pm.Gamma('combs_per_spot',
                                          mu=cell_number_prior['combs_per_spot'],
                                          sigma=np.sqrt(cell_number_prior['combs_per_spot'] \
                                                        / cell_number_prior['combs_mean_var_ratio']),
                                          shape=(self.n_obs, 1))

            shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
            rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
            self.combs_factors = pm.Gamma('combs_factors',
                                          alpha=shape,
                                          beta=rate,
                                          shape=(self.n_obs, self.n_comb))

            self.factors_per_combs = pm.Gamma('factors_per_combs',
                                              mu=cell_number_prior['factors_per_combs'],
                                              sigma=np.sqrt(cell_number_prior['factors_per_combs'] \
                                                            / cell_number_prior['factors_mean_var_ratio']),
                                              shape=(self.n_comb, 1))

            c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape(
                (1, 1))
            self.comb2fact = pm.Gamma('comb2fact',
                                      alpha=c2f_shape,
                                      beta=self.factors_per_combs,
                                      shape=(self.n_comb, self.n_fact))

            self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, self.comb2fact),
                                         sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \
                                                            / self.spot_fact_mean_var_ratio),
                                         shape=(self.n_obs, self.n_fact))

            # =====================Spot-specific additive component======================= #
            # molecule contribution that cannot be explained by cell-state signatures;
            # these counts are distributed across all genes, not just expressed genes
            self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2)
            self.spot_add = pm.Gamma('spot_add',
                                     self.spot_add_hyp[0],
                                     self.spot_add_hyp[1],
                                     shape=(self.n_obs, 1))

            # =====================Gene-specific additive component ======================= #
            # per-gene molecule contribution that cannot be explained by cell-state signatures;
            # these counts are distributed equally across all spots (e.g. background, free-floating RNA)
            self.gene_add_hyp = pm.Gamma('gene_add_hyp', 1, 1, shape=2)
            self.gene_add = pm.Gamma('gene_add',
                                     self.gene_add_hyp[0],
                                     self.gene_add_hyp[1],
                                     shape=(self.n_var, 1))

            # =====================Gene-specific overdispersion ======================= #
            self.phi_hyp = pm.Gamma('phi_hyp',
                                    mu=phi_hyp_prior['mean'],
                                    sigma=phi_hyp_prior['sd'],
                                    shape=(1, 1))
            self.gene_E = pm.Exponential('gene_E',
                                         self.phi_hyp,
                                         shape=(self.n_var, 1))

            # =====================Expected expression ======================= #
            # expected expression
            self.mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) * self.gene_level.T \
                           + self.gene_add.T + self.spot_add
            # tt.printing.Print('mu_biol')(self.mu_biol.shape)

            # =====================DATA likelihood ======================= #
            # Likelihood (sampling distribution) of observations; the NegativeBinomial adds overdispersion relative to a Poisson
            self.data_target = pm.NegativeBinomial(
                'data_target',
                mu=self.mu_biol,
                alpha=1 / (self.gene_E.T * self.gene_E.T),
                observed=self.x_data,
                total_size=self.X_data.shape)

            # =====================Compute nUMI from each factor in spots  ======================= #
            self.nUMI_factors = pm.Deterministic(
                'nUMI_factors', (self.spot_factors *
                                 (self.gene_factors * self.gene_level).sum(0)))
Example No. 30
    def __init__(
            self,
            cell_state_mat: np.ndarray,
            X_data: np.ndarray,
            Y_data: np.ndarray,
            n_comb: int = 50,
            data_type: str = 'float32',
            n_iter=20000,
            learning_rate=0.005,
            total_grad_norm_constraint=200,
            verbose=True,
            var_names=None, var_names_read=None,
            obs_names=None, fact_names=None, sample_id=None,
            gene_level_prior={'mean': 1 / 2, 'sd': 1 / 4, 'sample_alpha': 20},
            gene_level_var_prior={'mean_var_ratio': 1},
            cell_number_prior={'cells_per_spot': 8,
                               'factors_per_spot': 7,
                               'combs_per_spot': 2.5},
            cell_number_var_prior={'cells_mean_var_ratio': 1,
                                   'factors_mean_var_ratio': 1,
                                   'combs_mean_var_ratio': 1},
            phi_hyp_prior={'mean': 3, 'sd': 1},
            spot_fact_mean_var_ratio=0.5
    ):      
        

        ############# Initialise parameters ################
        super().__init__(cell_state_mat, X_data,
                         data_type, n_iter,
                         learning_rate, total_grad_norm_constraint,
                         verbose, var_names, var_names_read,
                         obs_names, fact_names, sample_id)
        
        self.Y_data = Y_data
        self.n_npro = Y_data.shape[1]
        self.y_data = theano.shared(Y_data.astype(self.data_type))
        self.n_rois = Y_data.shape[0]
        self.n_genes = X_data.shape[1]
        # Total number of gene counts in each region of interest, divided by 10^5:
        self.l_r = np.array([np.sum(X_data[i,:]) for i in range(self.n_rois)]).reshape(self.n_rois,1)*10**(-5)
        
        for k in gene_level_var_prior.keys():
            gene_level_prior[k] = gene_level_var_prior[k]

        self.gene_level_prior = gene_level_prior
        self.phi_hyp_prior = phi_hyp_prior
        self.n_comb = n_comb
        self.spot_fact_mean_var_ratio = spot_fact_mean_var_ratio

        cell_number_prior['factors_per_combs'] = (cell_number_prior['factors_per_spot'] /
                                                  cell_number_prior['combs_per_spot'])
        for k in cell_number_var_prior.keys():
            cell_number_prior[k] = cell_number_var_prior[k]
        self.cell_number_prior = cell_number_prior
        
        # generate one-hot encoded parameters for samples
        self.spot2sample_df = pd.get_dummies(sample_id)
        # convert to np.ndarray
        self.spot2sample_mat = self.spot2sample_df.values
        self.n_exper = self.spot2sample_mat.shape[1]
        # assign extra data to dictionary with (1) shared parameters (2) input data
        self.extra_data_tt = {'spot2sample': theano.shared(self.spot2sample_mat.astype(self.data_type))}
        self.extra_data = {'spot2sample': self.spot2sample_mat.astype(self.data_type)}

        ############# Define the model ################
        self.model = pm.Model()

        with self.model:
            
            # ============================ Negative Probe Binding ===================== #
            # Negative probe counts scale linearly with the total number of counts in a region of interest.
            # The linear slope is drawn from a gamma distribution. Mean and variance are inferred from the data
            # and are the same for the non-specific binding term for gene probes further below.
            self.b_n_hyper = pm.Gamma('b_n_hyper', alpha=np.array((3,1)), beta=np.array((1,1)), shape=2)
            self.b_n = pm.Gamma('b_n', mu=self.b_n_hyper[0], sigma=self.b_n_hyper[1], 
                                shape=(self.n_exper, self.n_npro))
            self.y_rn = pm.math.dot(self.extra_data_tt['spot2sample'], self.b_n) * self.l_r
            
            # ===================== Non-specific binding additive component ======================= #
            # Additive terms for non-specific binding of gene probes are drawn from a gamma distribution
            # with the same mean and variance as for the negative probes above.
            self.gene_add = pm.Gamma('gene_add', mu=self.b_n_hyper[0], sigma=self.b_n_hyper[1], 
                                     shape=(self.n_exper, self.n_genes))

            # =====================Gene expression level scaling======================= #
            # Explains differences in expression between genes and
            # how expression differs between single-cell and spatial technologies.
            # Compute gamma hyperparameters from the prior mean and sd:
            shape = gene_level_prior['mean'] ** 2 / gene_level_prior['sd'] ** 2
            rate = gene_level_prior['mean'] / gene_level_prior['sd'] ** 2
            shape_var = shape / gene_level_prior['mean_var_ratio']
            rate_var = rate / gene_level_prior['mean_var_ratio']
            self.gene_level_alpha_hyp = pm.Gamma('gene_level_alpha_hyp',
                                                 mu=shape, sigma=np.sqrt(shape_var),
                                                 shape=(1, 1))
            self.gene_level_beta_hyp = pm.Gamma('gene_level_beta_hyp',
                                                mu=rate, sigma=np.sqrt(rate_var),
                                                shape=(1, 1))
            # global per gene sensitivity, including platform effect
            self.gene_level = pm.Gamma('gene_level', self.gene_level_alpha_hyp,
                                       self.gene_level_beta_hyp, shape=(1, self.n_genes))
            
            # independent experiment-specific effect on each gene (narrow prior around 1)
            self.gene_level_independent = pm.Gamma('gene_level_independent', 
                                                   100, 100, shape=(self.n_exper, self.n_genes))
            # experiment specific capture efficiency (wide prior around 1)
            self.gene_level_e = pm.Gamma('gene_level_e', gene_level_prior['sample_alpha'],
                                         gene_level_prior['sample_alpha'], shape=(self.n_exper, 1))

            self.gene_factors = pm.Deterministic('gene_factors', self.cell_state)

            # =====================Spot factors======================= #
            # prior on spot factors reflects the number of cells and the fraction of their cytoplasm captured,
            # times the heterogeneity in total mRNA content between individual cells of each cell type
            self.cells_per_spot = pm.Gamma('cells_per_spot',
                                           mu=cell_number_prior['cells_per_spot'],
                                           sigma=np.sqrt(cell_number_prior['cells_per_spot'] \
                                                         / cell_number_prior['cells_mean_var_ratio']),
                                           shape=(self.n_rois, 1))
            self.comb_per_spot = pm.Gamma('combs_per_spot',
                                          mu=cell_number_prior['combs_per_spot'],
                                          sigma=np.sqrt(cell_number_prior['combs_per_spot'] \
                                                        / cell_number_prior['combs_mean_var_ratio']),
                                          shape=(self.n_rois, 1))

            shape = self.comb_per_spot / np.array(self.n_comb).reshape((1, 1))
            rate = tt.ones((1, 1)) / self.cells_per_spot * self.comb_per_spot
            self.combs_factors = pm.Gamma('combs_factors', alpha=shape, beta=rate,
                                          shape=(self.n_rois, self.n_comb))

            self.factors_per_combs = pm.Gamma('factors_per_combs',
                                              mu=cell_number_prior['factors_per_combs'],
                                              sigma=np.sqrt(cell_number_prior['factors_per_combs'] \
                                                            / cell_number_prior['factors_mean_var_ratio']),
                                              shape=(self.n_comb, 1))
            c2f_shape = self.factors_per_combs / np.array(self.n_fact).reshape((1, 1))
            self.comb2fact = pm.Gamma('comb2fact', alpha=c2f_shape, beta=self.factors_per_combs,
                                      shape=(self.n_comb, self.n_fact))

            self.spot_factors = pm.Gamma('spot_factors', mu=pm.math.dot(self.combs_factors, self.comb2fact),
                                         sigma=pm.math.sqrt(pm.math.dot(self.combs_factors, self.comb2fact) \
                                                            / self.spot_fact_mean_var_ratio),
                                         shape=(self.n_rois, self.n_fact))

            # =====================Spot-specific additive component======================= #
            # molecule contribution that cannot be explained by cell-state signatures;
            # these counts are distributed across all genes, not just expressed genes
            self.spot_add_hyp = pm.Gamma('spot_add_hyp', 1, 1, shape=2)
            self.spot_add = pm.Gamma('spot_add', self.spot_add_hyp[0],
                                     self.spot_add_hyp[1], shape=(self.n_rois, 1))            

            # =====================Gene-specific overdispersion ======================= #
            self.phi_hyp = pm.Gamma('phi_hyp', mu=phi_hyp_prior['mean'],
                                    sigma=phi_hyp_prior['sd'], shape=(1, 1))
            self.gene_E = pm.Exponential('gene_E', self.phi_hyp, shape=(self.n_exper, self.n_genes))

            # =====================Expected expression ======================= #
            # Expected counts for negative probes and gene probes concatenated into one array. Note that non-specific binding
            # scales linearly with the total number of counts (l_r) in this model.
            mu_biol = pm.math.dot(self.spot_factors, self.gene_factors.T) \
            * self.gene_level \
            * pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_level_e) \
            * pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_level_independent) \
            + pm.math.dot(self.extra_data_tt['spot2sample'], self.gene_add) * self.l_r \
            + self.spot_add
            self.mu_biol = tt.concatenate([self.y_rn, mu_biol], axis = 1)

            # =====================DATA likelihood ======================= #
            # Likelihood (sampling distribution) of observations; the NegativeBinomial adds overdispersion relative to a Poisson
            self.data_target = pm.NegativeBinomial('data_target', mu=self.mu_biol,
                               alpha=tt.concatenate([np.full((self.n_rois, self.n_npro), 10**10),
                                                     pm.math.dot(self.extra_data_tt['spot2sample'], 
                                                                 1 / (self.gene_E * self.gene_E))],
                                                    axis = 1),
                               observed=tt.concatenate([self.y_data, self.x_data], axis = 1))

            # =====================Compute nUMI from each factor in spots  ======================= #                          
            self.nUMI_factors = pm.Deterministic('nUMI_factors',
                                                 (self.spot_factors * (self.gene_factors * self.gene_level.T).sum(0)))
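
The 10**10 alpha given to the negative-probe block above effectively turns the NegativeBinomial into a Poisson, since its variance is mu + mu**2 / alpha. A small numerical check, using the same .dist().random() pattern as earlier examples:

import pymc3 as pm

mu = 50.0
nb_tight = pm.NegativeBinomial.dist(mu=mu, alpha=10 ** 10)
draws = nb_tight.random(size=100000)
print(draws.mean(), draws.var())  # both close to 50, i.e. Poisson-like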