Example #1
def logit_rate_from_range(rate):
    """
    calculate age-specific rates and variances
    in logit space from a Rate model object
    """
    logit_mesh = np.arange(rate.age_start, rate.age_end+1)
    pop_vals = np.array(rate.population())

    n = (rate.numerator + 1.*NEARLY_ZERO) * pop_vals / np.sum(pop_vals)
    d = (rate.denominator + 2.*NEARLY_ZERO) * pop_vals / np.sum(pop_vals)
    
    logit_rate = mc.logit(np.minimum(n/d, 1.-NEARLY_ZERO))
    logit_V = ( logit_rate - mc.logit( n/d + (n/d)*(1.-n/d)/np.sqrt(d) ) )**2.

    # filter out the points where the denominator is very close to zero
    good_mesh = []
    good_rate = []
    good_V = []
    
    for ii in range(len(logit_mesh)):
        if n[ii] > 0. and n[ii] < d[ii] and d[ii] > .01:
            good_mesh.append(logit_mesh[ii])
            good_rate.append(logit_rate[ii])
            good_V.append(logit_V[ii])
    
    return good_mesh, good_rate, good_V
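
The variance approximation above works entirely on the log-odds scale. Below is a minimal sketch of the same computation, using plain numpy in place of PyMC's mc.logit (assumed to be the standard log(p/(1-p)) transform), omitting the NEARLY_ZERO guards; the numerator and denominator arrays are made up:

import numpy as np

def logit(p):
    return np.log(p / (1.0 - p))

n = np.array([3.0, 12.0, 40.0])      # hypothetical numerators by age group
d = np.array([100.0, 200.0, 400.0])  # hypothetical denominators by age group
p = n / d

logit_rate = logit(np.minimum(p, 1.0 - 1e-10))
# finite-difference approximation used above: perturb p by p*(1-p)/sqrt(d)
# and square the resulting change on the logit scale
logit_V = (logit_rate - logit(p + p * (1.0 - p) / np.sqrt(d)))**2
print(logit_rate, logit_V)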
Example #2
def generate_synthetic_data(truth, key, d):
    """ create simulated data"""
    a0 = d['age_start']
    a1 = d['age_end']
    age_weights = d['age_weights']
        
    d.update(condition='type_2_diabetes',
             year_start=y,
             year_end=y)

    p0 = dismod3.utils.rate_for_range(truth[key], range(a0, a1 + 1), np.ones(a1 + 1 - a0)/(a1+1-a0))
    p0 = dismod3.utils.trim(p0, 1.e-6, 1. - 1.e-6)

    # TODO: make beta dispersion study level (instead of datum level)
    # p1 = mc.rbeta(p0 * dispersion, (1 - p0) * dispersion)
    p1 = p0

    # TODO: add additional covariates
    if key.find('prevalence') != -1:
        if random.random() < .1:
            d['self-reported'] = True
            p1 = mc.invlogit(mc.logit(p1) - .2)
        else:
            d['self-reported'] = False
    
    #p2 = mc.rbinomial(n, p1) / n
    p2 = float(p1)
    
    d['value'] = p2
    d['standard_error'] = .0001

    return d
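
The self-report adjustment above is a fixed shift on the log-odds scale. A small sketch of that step, with scipy.special standing in for PyMC's mc.logit/mc.invlogit (assumed equivalent to logit/expit); the prevalence value is made up:

from scipy.special import logit, expit

p = 0.08                               # hypothetical true prevalence
p_self_report = expit(logit(p) - 0.2)  # about 0.066: self-report shifted down by 0.2 logits
print(p, p_self_report)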
Example #3
def sim_cod_data(N, cf_rec): 
    """ 
    Create an NxJ matrix of simulated data (J is the number of causes and is determined
    by the length of cf_mean). 

    N - the number of simulations    
    cf_rec - a recarray containing: 
        cause - a list of causes 
        est - the estimates of the cause fractions
        lower - the lower bound of the cause fractions
        upper - the upper bound of the cause fractions 
    """

    # logit the mean and bounds and approximate the standard deviation in logit space
    cf_mean = mc.logit(cf_rec.est)
    cf_lower = mc.logit(cf_rec.lower)
    cf_upper = mc.logit(cf_rec.upper)
    std = (cf_upper - cf_lower)/(2*1.96)
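
The snippet ends after converting the estimate and uncertainty interval to the logit scale. A plausible continuation, which is an assumption rather than the project's actual code, would draw the N simulations as normal samples in logit space and map them back with the inverse logit:

import numpy as np
from scipy.special import logit, expit

def sim_cod_data_sketch(N, est, lower, upper, seed=0):
    """Hypothetical continuation: draw N logit-normal samples per cause."""
    rng = np.random.default_rng(seed)
    cf_mean = logit(est)
    std = (logit(upper) - logit(lower)) / (2 * 1.96)
    draws = rng.normal(cf_mean, std, size=(N, len(est)))
    return expit(draws)  # NxJ matrix of simulated cause fractions

print(sim_cod_data_sketch(5, np.array([.3, .6, .1]),
                          np.array([.25, .55, .08]),
                          np.array([.35, .65, .12])))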
Example #5
def sim_data(N,
             true_cf=[[.3, .6, .1], [.3, .5, .2]],
             true_std=[[.2, .05, .05], [.3, 0.1, 0.1]],
             sum_to_one=True):
    """ 
    Create an NxTxJ matrix of simulated data (T is determined by the length 
    of true_cf, J by the length of the elements of true_cf). 

    true_cf - a list of lists of true cause fractions (each must sum to one)
    true_std - a list of lists of the standard deviations corresponding to the true csmf's 
             for each time point. Can either be a list of length J inside a list of length
             1 (in this case, the same standard deviation is used for all time points) or 
             can be T lists of length J (in this case, a separate standard deviation 
             is specified and used for each time point). 
    """

    if sum_to_one == True:
        assert pl.allclose(pl.sum(true_cf, 1),
                           1), 'The sum of elements of true_cf must equal 1'
    T = len(true_cf)
    J = len(true_cf[0])

    ## if only one std provided, duplicate for all time points
    if len(true_std) == 1 and len(true_cf) > 1:
        true_std = [true_std[0] for i in range(len(true_cf))]

    ## transform the mean and std to logit space
    transformed_std = []
    for t in range(T):
        pi_i = pl.array(true_cf[t])
        sigma_pi_i = pl.array(true_std[t])
        transformed_std.append(
            ((1 / (pi_i * (pi_i - 1)))**2 * sigma_pi_i**2)**0.5)

    ## find minimum standard deviation (by cause across time) and draw from this
    min = pl.array(transformed_std).min(0)
    common_perturbation = [
        pl.ones([T, J]) * mc.rnormal(mu=0, tau=min**-2) for n in range(N)
    ]

    ## draw from remaining variation
    tau = pl.array(transformed_std)**2 - min**2
    tau[tau == 0] = 0.000001
    additional_perturbation = [
        [mc.rnormal(mu=0, tau=tau[t]**-1) for t in range(T)] for n in range(N)
    ]

    result = pl.zeros([N, T, J])
    for n in range(N):
        result[n, :, :] = [
            mc.invlogit(
                mc.logit(true_cf[t]) + common_perturbation[n][t] +
                additional_perturbation[n][t]) for t in range(T)
        ]

    return result
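
The key structure above is that each simulated draw combines a perturbation shared across time points with an independent per-time-point perturbation, all on the logit scale. A stripped-down numpy sketch of that decomposition for a single draw (made-up cause fractions and logit-scale standard deviations, scipy standing in for mc.logit/mc.invlogit):

import numpy as np
from scipy.special import logit, expit

rng = np.random.default_rng(1)
true_cf = np.array([[.3, .6, .1], [.3, .5, .2]])          # T=2 time points, J=3 causes
transformed_std = np.array([[.5, .2, .3], [.6, .4, .5]])  # already on the logit scale

common = rng.normal(0, transformed_std.min(axis=0))        # shared across time points
extra_var = transformed_std**2 - transformed_std.min(axis=0)**2
additional = rng.normal(0, np.sqrt(np.maximum(extra_var, 1e-6)))  # per time point
draw = expit(logit(true_cf) + common + additional)
print(draw)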
Example #6
def sim_data(N, true_cf=[[.3, .6, .1],
                           [.3, .5, .2]],
             true_std=[[.2, .05, .05], 
                       [.3, 0.1, 0.1]],
             sum_to_one=True):
    """ 
    Create an NxTxJ matrix of simulated data (T is determined by the length 
    of true_cf, J by the length of the elements of true_cf). 

    true_cf - a list of lists of true cause fractions (each must sum to one)
    true_std - a list of lists of the standard deviations corresponding to the true csmf's 
             for each time point. Can either be a list of length J inside a list of length
             1 (in this case, the same standard deviation is used for all time points) or 
             can be T lists of length J (in this case, a separate standard deviation 
             is specified and used for each time point). 
    """

    if sum_to_one == True: 
        assert pl.allclose(pl.sum(true_cf, 1), 1), 'The sum of elements of true_cf must equal 1' 
    T = len(true_cf)
    J = len(true_cf[0])
    
    ## if only one std provided, duplicate for all time points 
    if len(true_std)==1 and len(true_cf)>1: 
        true_std = [true_std[0] for i in range(len(true_cf))]    

    ## transform the mean and std to logit space
    transformed_std = []
    for t in range(T): 
        pi_i = pl.array(true_cf[t])
        sigma_pi_i = pl.array(true_std[t])
        transformed_std.append( ((1/(pi_i*(pi_i-1)))**2 * sigma_pi_i**2)**0.5 )
            
    ## find minimum standard deviation (by cause across time) and draw from this 
    min = pl.array(transformed_std).min(0)
    common_perturbation = [pl.ones([T,J])*mc.rnormal(mu=0, tau=min**-2) for n in range(N)]
    
    ## draw from remaining variation 
    tau=pl.array(transformed_std)**2 - min**2
    tau[tau==0] = 0.000001
    additional_perturbation = [[mc.rnormal(mu=0, tau=tau[t]**-1) for t in range(T)] for n in range(N)]

    result = pl.zeros([N, T, J])
    for n in range(N):
        result[n, :, :] = [mc.invlogit(mc.logit(true_cf[t]) + common_perturbation[n][t] + additional_perturbation[n][t]) for t in range(T)]

    return result
Example #7
    def mortality(self, key="all-cause_mortality", data=None):
        """ Calculate the all-cause mortality rate for the
        region and sex of disease_model, and return it
        in an array corresponding to age_mesh

        Parameters
        ----------
        key : str, optional
          of the form 'all-cause_mortality+gbd_region+year+sex'
        data: list, optional
          the data list to extract all-cause mortality from
        """
        if self.params.get("initial_value", {}).has_key(key):
            return self.get_initial_value(key)

        if not data:
            data = self.filter_data("all-cause_mortality data")

        if len(data) == 0:
            return NEARLY_ZERO * np.ones(len(self.get_estimate_age_mesh()))
        else:
            M, C = uninformative_prior_gp(c=-1.0, scale=300.0)
            age = []
            val = []
            V = []
            for d in data:
                scale = self.extract_units(d)
                a0 = d.get("age_start", MISSING)
                a1 = d.get("age_end", MISSING)
                y = self.value_per_1(d)
                se = self.se_per_1(d)

                if se == MISSING:
                    se = 0.01
                if MISSING in [a0, a1, y]:
                    continue

                age.append(0.5 * (a0 + a1))
                val.append(y + 0.00001)
                V.append(se ** 2.0)

            if len(data) > 0:
                gp.observe(M, C, age, mc.logit(val), V)

            normal_approx_vals = mc.invlogit(M(self.get_estimate_age_mesh()))
            self.set_initial_value(key, normal_approx_vals)
            return self.get_initial_value(key)
Example #8
def make_model(lon, lat, input_data, covariate_keys, pos, neg, cpus=1):
    """
    This function is required by the generic MBG code.
    """
    # Uniquify data locations
    data_mesh, logp_mesh, fi, ui, ti = uniquify(lon, lat)

    # Create the mean & its evaluation at the data locations.
    m = pm.Uninformative("m", 0)

    @pm.deterministic
    def M(m=m):
        return pm.gp.Mean(mean_fn, m=m)

    # The partial sill.
    amp = pm.Exponential("amp", 0.1, value=1.0)

    # The range parameter.
    scale = pm.Exponential("scale", 0.1, value=0.08)

    # This parameter controls the degree of differentiability of the field.
    diff_degree = pm.Uniform("diff_degree", 0.01, 3)

    # The nugget variance.
    V = pm.Gamma("V", 4, 40, value=0.1)
    tau = 1.0 / V

    # Create the covariance & its evaluation at the data locations.
    @pm.deterministic(trace=True)
    def C(amp=amp, scale=scale, diff_degree=diff_degree):
        """A covariance function created from the current parameter values."""
        return pm.gp.FullRankCovariance(pm.gp.cov_funs.matern.euclidean, amp=amp, scale=scale, diff_degree=diff_degree)

    # The Gaussian process submodel
    sp_sub = pm.gp.GPSubmodel("sp_sub", M, C, logp_mesh)

    # Add the nugget process
    eps_p_f = pm.Normal("eps_p_f", sp_sub.f_eval[fi], tau, value=pm.logit((pos + 1.0) / (pos + neg + 2.0)))

    # Probability of 'success'
    p = pm.Lambda("s", lambda lt=eps_p_f: invlogit(lt), trace=False)

    # The data have the 'observed' flag set to True.
    d = pm.Binomial("d", pos + neg, p, value=pos, observed=True)

    return locals()
Example #9
def values_from(dm, d, min_val=1.e-5, max_se=.1):
    """ Extract the normalized values from a piece of data

    Parameters
    ----------
    dm : disease model

    d : data dict

    min_val : float, optional
      the value to use instead of zero, since logit cannot model true zero

    max_se : float, optional
      the standard error to use for data with missing or zero standard error
    """
    est_mesh = dm.get_estimate_age_mesh()

    # get the index vector and weight vector for the age range
    age_indices = indices_for_range(est_mesh, d['age_start'], d['age_end'])
    age_weights = d.get('age_weights', np.ones(len(age_indices)))

    # ensure all rate data is valid
    d_val = dm.value_per_1(d)
    if d_val < 0 or d_val > 1:
        debug('WARNING: data %d not in range (0,1)' % d['id'])
        raise ValueError
    elif d_val == 0.:
        d_val = min_val / 10.  # TODO: determine if this is an acceptable way to deal with zero
    elif d_val == 1.:
        d_val = 1. - min_val / 10.

    logit_val = mc.logit(d_val)

    d_se = dm.se_per_1(d)
    if d_se == MISSING:
        d_se = max_se  # TODO: determine if this is an acceptable way to deal with missing
    elif d_se == 0.:
        d_se = max_se

    logit_se = (1/d_val + 1/(1-d_val)) * d_se

    return age_indices, age_weights, logit_val, logit_se
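
The logit_se line is a delta-method step: the derivative of logit(p) is 1/p + 1/(1-p), so a standard error on the natural scale is scaled by that factor on the logit scale. A quick numerical check, with scipy standing in for mc.logit and made-up values of p and se:

from scipy.special import logit

p, se = 0.2, 0.01
analytic = (1 / p + 1 / (1 - p)) * se           # 0.0625
numeric = (logit(p + se) - logit(p - se)) / 2   # about 0.0626
print(analytic, numeric)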
Example #10
File: vzv.py Project: aflaxman/gbd
pl.plot(X, Y, 'ks', label='Observed', mec='w', mew=1)


XX = sm.add_constant(X)
X_pred = pl.arange(65)
XX_pred = sm.add_constant(X_pred)


model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)

pl.plot(X_pred, Y_pred, 'k-', linewidth=2, label='Predicted by OLS')


Y = mc.logit(df['Parameter Value'].__array__())
model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)

pl.plot(X_pred, mc.invlogit(Y_pred), 'k--', linewidth=2, label='Predicted by logit-transformed OLS')


pl.xlabel('Age (Years)')
pl.ylabel('Seroprevalence (Per 1)')
pl.legend(loc='lower right', fancybox=True, shadow=True)
pl.axis([-5, 55, 0, 1.2])
pl.grid()

pl.savefig('vzv_forest.pdf')
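
The second fit above illustrates why the logit transform is useful here: OLS on the raw proportions can predict values outside (0, 1), while OLS on logit(Y) mapped back through the inverse logit cannot. A self-contained sketch with made-up seroprevalence data, using statsmodels and scipy in place of the snippet's sm and mc objects:

import numpy as np
import statsmodels.api as sm
from scipy.special import logit, expit

X = np.array([1.0, 5, 10, 20, 30, 40, 50])              # hypothetical ages
Y = np.array([0.05, 0.2, 0.45, 0.7, 0.85, 0.92, 0.96])  # hypothetical seroprevalence

XX = sm.add_constant(X)
X_pred = np.arange(65.0)
XX_pred = sm.add_constant(X_pred)

ols_pred = sm.OLS(Y, XX).fit().predict(XX_pred)                   # can exceed 1 at high ages
logit_pred = expit(sm.OLS(logit(Y), XX).fit().predict(XX_pred))   # stays inside (0, 1)
print(ols_pred.max(), logit_pred.max())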
Example #11
def fit_without_confrontation(id, region, sex, year):
    """ Fit posterior of specified region/sex/year for specified model
    without trying to integrate conflicting sources of data

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
    """

    ## load model
    dm = dismod3.load_disease_model(id)

    ## separate out prevalence and relative-risk data
    prev_data = [
        d for d in dm.data
        if dm.relevant_to(d, 'prevalence', region, year, sex)
    ]
    rr_data = [
        d for d in dm.data
        if dm.relevant_to(d, 'relative-risk', region, year, sex)
    ]
    dm.data = [d for d in dm.data if not d in prev_data and not d in rr_data]

    ### setup the generic disease model (without prevalence data)
    import dismod3.gbd_disease_model as model
    keys = dismod3.utils.gbd_keys(region_list=[region],
                                  year_list=[year],
                                  sex_list=[sex])
    dm.calc_effective_sample_size(dm.data)
    dm.vars = model.setup(dm, keys)

    ## override the birth prevalence prior, based on the withheld prevalence data
    logit_C_0 = dm.vars[dismod3.utils.gbd_key_for('bins', region, year,
                                                  sex)]['initial']['logit_C_0']
    assert len(prev_data) == 1, 'should be a single prevalence datum'
    d = prev_data[0]

    mu_logit_C_0 = mc.logit(dm.value_per_1(d) + dismod3.settings.NEARLY_ZERO)
    lb, ub = dm.bounds_per_1(d)
    sigma_logit_C_0 = (mc.logit(ub + dismod3.settings.NEARLY_ZERO) -
                       mc.logit(lb + dismod3.settings.NEARLY_ZERO)) / (2 *
                                                                       1.96)
    print 'mu_C_0_pri:', mc.invlogit(mu_logit_C_0)
    print 'ui_C_0_pri:', lb, ub

    # override the excess-mortality, based on the relative-risk data
    mu_rr = 1.01 * np.ones(dismod3.settings.MAX_AGE)
    sigma_rr = .01 * np.ones(dismod3.settings.MAX_AGE)
    for d in rr_data:
        mu_rr[d['age_start']:(d['age_end'] + 1)] = dm.value_per_1(d)
        sigma_rr[d['age_start']:(d['age_end'] + 1)] = dm.se_per_1(d)
    print 'mu_rr:', mu_rr.round(2)
    #print 'sigma_rr:', sigma_rr.round(2)

    log_f = dm.vars[dismod3.utils.gbd_key_for('excess-mortality', region, year,
                                              sex)]['age_coeffs']
    log_f_mesh = log_f.parents['gamma_mesh']
    param_mesh = log_f.parents['param_mesh']

    m_all = dm.vars[dismod3.utils.gbd_key_for('all-cause_mortality', region,
                                              year, sex)]
    mu_log_f = np.log((mu_rr - 1) * m_all)
    sigma_log_f = 1 / ((mu_rr - 1) * m_all) * sigma_rr * m_all
    print 'mu_log_f:', mu_log_f.round(2)[param_mesh]
    print 'sigma_log_f:', sigma_log_f.round(2)[param_mesh]

    ### fit the model using Monte Carlo simulation (shoehorned into the MCMC framework of PyMC)
    dm.mcmc = mc.MCMC(dm.vars)
    dm.mcmc.use_step_method(SampleFromNormal,
                            logit_C_0,
                            mu=mu_logit_C_0,
                            tau=sigma_logit_C_0**-2)
    dm.mcmc.use_step_method(SampleFromNormal,
                            log_f_mesh,
                            mu=mu_log_f[param_mesh],
                            tau=sigma_log_f[param_mesh]**-2)
    for stoch in dm.mcmc.stochastics:
        dm.mcmc.use_step_method(mc.NoStepper, stoch)
    dm.mcmc.sample(1000, verbose=dismod3.settings.ON_SGE)

    #print 'mu_C_0_post:', mc.invlogit(logit_C_0.stats()['mean']).round(2)
    #print 'ui_C_0_post:', mc.invlogit(logit_C_0.stats()['95% HPD interval']).round(2)
    #print 'mu_rr_post:', dm.vars[dismod3.utils.gbd_key_for('relative-risk', region, year, sex)]['rate_stoch'].stats()['mean'].round(2)
    print 'mu_log_f_mesh_post:', log_f_mesh.stats()['mean'].round(2)
    print 'mu_f_post:', dm.vars[dismod3.utils.gbd_key_for(
        'excess-mortality', region, year,
        sex)]['rate_stoch'].stats()['mean'].round(2)

    for k in keys:
        t, r, y, s = dismod3.utils.type_region_year_sex_from_key(k)

        if t in [
                'incidence', 'prevalence', 'remission', 'excess-mortality',
                'mortality', 'prevalence_x_excess-mortality'
        ]:
            dismod3.neg_binom_model.store_mcmc_fit(dm, k, dm.vars[k])

        elif t in ['relative-risk', 'duration', 'incidence_x_duration']:
            dismod3.normal_model.store_mcmc_fit(dm, k, dm.vars[k])

    from fit_posterior import save_country_level_posterior
    if str(year) == '2005':  # also generate 2010 estimates
        save_country_level_posterior(dm, region, 2010, sex,
                                     ['prevalence', 'remission'])
    save_country_level_posterior(
        dm, region, year, sex, ['prevalence', 'remission']
    )  #'prevalence incidence remission excess-mortality duration mortality relative-risk'.split())

    # save results (do this last, because it removes things from the disease model that the plotting function, etc., might need)
    keys = dismod3.utils.gbd_keys(region_list=[region],
                                  year_list=[year],
                                  sex_list=[sex])
    dm.save('dm-%d-posterior-%s-%s-%s.json' % (dm.id, region, sex, year),
            keys_to_save=keys)

    return dm
Example #12
def transform_bin_data(pos, neg):
    return pm.logit((pos+1.)/(pos+neg+2.))
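
This is the smoothed empirical logit: adding one success and one failure keeps the transform finite even when pos or neg is zero. A quick check with numpy and scipy, which are assumed to match pm.logit; the counts are made up:

import numpy as np
from scipy.special import logit

pos = np.array([0, 3, 10])
neg = np.array([20, 7, 0])
print(logit((pos + 1.0) / (pos + neg + 2.0)))   # finite for every entry, including the zeros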
Example #13
    
    # ===========================
    # = Create likelihood layer =
    # ===========================
        
    eps_p_f_list = []
    N_pos_list = []
    
    # Obtain the spline representation of the log of the Monte Carlo-integrated 
    # likelihood function at each datapoint. The nodes are at .01,.02,...,.98,.99 .
    junk, splreps = age_corr_likelihoods(all_pts, 10000, np.arange(.01,1.,.01), norun_name)
    for i in xrange(len(splreps)):
        splreps[i] = list(splreps[i])

    # Don't worry, these are just reasonable initial values...
    val_now = pm.logit(np.array(all_pts.PF+1,dtype=float)/(all_pts.EXAMINED+2))
    if with_stukel:
        val_now = pm.stukel_logit(np.array(all_pts.PF+1,dtype=float)/(all_pts.EXAMINED+2), a1.value, a2.value)
    
    if data_mesh.shape[0] % chunk == 0:
        additional_index = 0
    else:
        additional_index = 1
    
    for i in xrange(0,data_mesh.shape[0] / chunk + additional_index):
        
        this_slice = slice(chunk*i, min((i+1)*chunk, data_mesh.shape[0]))

        # epsilon plus f, given f.
        @pm.stochastic(trace=False, dtype=np.float)
        def eps_p_f_now(value=val_now[this_slice], f=f_eval, V=V, this_slice = this_slice):
Example #14
            print "Trying again: %s" % msg
            init_OK = False
            gc.collect()

    # ===========================
    # = Create likelihood layer =
    # ===========================

    eps_p_f_list = []
    N_pos_list = []

    # Don't worry, these are just reasonable initial values...
    if with_stukel:
        val_now = pm.stukel_logit((pos + 1.0) / (pos + neg + 2.0), a1.value, a2.value)
    else:
        val_now = pm.logit((pos + 1.0) / (pos + neg + 2.0))

    if data_mesh.shape[0] % chunk == 0:
        additional_index = 0
    else:
        additional_index = 1

    for i in xrange(0, data_mesh.shape[0] / chunk + additional_index):

        this_slice = slice(chunk * i, min((i + 1) * chunk, data_mesh.shape[0]))

        # epsilon plus f, given f.
        @pm.stochastic(trace=False, dtype=np.float)
        def eps_p_f_now(value=val_now[this_slice], f=sp_sub.f_eval, V=V, sl=this_slice):
            return pm.normal_like(value, f[fi][sl], 1.0 / V)
Example #15
    # ===========================
    # = Create likelihood layer =
    # ===========================

    eps_p_f_list = []
    N_pos_list = []

    # Obtain the spline representation of the log of the Monte Carlo-integrated
    # likelihood function at each datapoint. The nodes are at .01,.02,...,.98,.99 .
    junk, splreps = age_corr_likelihoods(all_pts, 10000,
                                         np.arange(.01, 1., .01), norun_name)
    for i in xrange(len(splreps)):
        splreps[i] = list(splreps[i])

    # Don't worry, these are just reasonable initial values...
    val_now = pm.logit(
        np.array(all_pts.PF + 1, dtype=float) / (all_pts.EXAMINED + 2))
    if with_stukel:
        val_now = pm.stukel_logit(
            np.array(all_pts.PF + 1, dtype=float) / (all_pts.EXAMINED + 2),
            a1.value, a2.value)

    if data_mesh.shape[0] % chunk == 0:
        additional_index = 0
    else:
        additional_index = 1

    for i in xrange(0, data_mesh.shape[0] / chunk + additional_index):

        this_slice = slice(chunk * i, min((i + 1) * chunk, data_mesh.shape[0]))

        # epsilon plus f, given f.
Example #16
def set_birth_prev(value):
    model.vars['logit_C0'].value = mc.logit(pl.maximum(1.e-9, value))
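
The pl.maximum(1.e-9, value) guard matters because logit(0) is minus infinity. A minimal illustration with scipy and numpy (assumed to behave like mc.logit and pl.maximum here):

import numpy as np
from scipy.special import logit

print(logit(0.0))                      # -inf, so a raw zero cannot be used
print(logit(np.maximum(1e-9, 0.0)))    # about -20.7, a finite stand-in for zero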
Example #17
def fit_without_confrontation(id, region, sex, year):
    """ Fit posterior of specified region/sex/year for specified model
    without trying to integrate conflicting sources of data

    Parameters
    ----------
    id : int
      The model id number for the job to fit
    region : str
      From dismod3.settings.gbd_regions, but clean()-ed
    sex : str, from dismod3.settings.gbd_sexes
    year : str, from dismod3.settings.gbd_years
    """

    ## load model
    dm = dismod3.load_disease_model(id)


    ## separate out prevalence and relative-risk data
    prev_data = [d for d in dm.data if dm.relevant_to(d, 'prevalence', region, year, sex)]
    rr_data = [d for d in dm.data if dm.relevant_to(d, 'relative-risk', region, year, sex)]
    dm.data = [d for d in dm.data if not d in prev_data and not d in rr_data]


    ### setup the generic disease model (without prevalence data)
    import dismod3.gbd_disease_model as model
    keys = dismod3.utils.gbd_keys(region_list=[region], year_list=[year], sex_list=[sex])
    dm.calc_effective_sample_size(dm.data)
    dm.vars = model.setup(dm, keys)


    ## override the birth prevalence prior, based on the withheld prevalence data
    logit_C_0 = dm.vars[dismod3.utils.gbd_key_for('bins', region, year, sex)]['initial']['logit_C_0']
    assert len(prev_data) == 1, 'should be a single prevalence datum'
    d = prev_data[0]

    mu_logit_C_0 = mc.logit(dm.value_per_1(d)+dismod3.settings.NEARLY_ZERO)
    lb, ub = dm.bounds_per_1(d)
    sigma_logit_C_0 = (mc.logit(ub+dismod3.settings.NEARLY_ZERO) - mc.logit(lb+dismod3.settings.NEARLY_ZERO)) / (2 * 1.96)
    print 'mu_C_0_pri:', mc.invlogit(mu_logit_C_0)
    print 'ui_C_0_pri:', lb, ub

    # override the excess-mortality, based on the relative-risk data
    mu_rr = 1.01*np.ones(dismod3.settings.MAX_AGE)
    sigma_rr = .01*np.ones(dismod3.settings.MAX_AGE)
    for d in rr_data:
        mu_rr[d['age_start']:(d['age_end']+1)] = dm.value_per_1(d)
        sigma_rr[d['age_start']:(d['age_end']+1)] = dm.se_per_1(d)
    print 'mu_rr:', mu_rr.round(2)
    #print 'sigma_rr:', sigma_rr.round(2)

    log_f = dm.vars[dismod3.utils.gbd_key_for('excess-mortality', region, year, sex)]['age_coeffs']
    log_f_mesh = log_f.parents['gamma_mesh']
    param_mesh = log_f.parents['param_mesh']
    
    m_all = dm.vars[dismod3.utils.gbd_key_for('all-cause_mortality', region, year, sex)]
    mu_log_f = np.log((mu_rr-1) * m_all)
    sigma_log_f = 1 / ((mu_rr-1) * m_all) * sigma_rr * m_all
    print 'mu_log_f:', mu_log_f.round(2)[param_mesh]
    print 'sigma_log_f:', sigma_log_f.round(2)[param_mesh]
    
    ### fit the model using Monte Carlo simulation (shoehorned into the MCMC framework of PyMC)
    dm.mcmc = mc.MCMC(dm.vars)
    dm.mcmc.use_step_method(SampleFromNormal, logit_C_0, mu=mu_logit_C_0, tau=sigma_logit_C_0**-2)
    dm.mcmc.use_step_method(SampleFromNormal, log_f_mesh, mu=mu_log_f[param_mesh], tau=sigma_log_f[param_mesh]**-2)
    for stoch in dm.mcmc.stochastics:
        dm.mcmc.use_step_method(mc.NoStepper, stoch)
    dm.mcmc.sample(1000, verbose=dismod3.settings.ON_SGE)

    #print 'mu_C_0_post:', mc.invlogit(logit_C_0.stats()['mean']).round(2)
    #print 'ui_C_0_post:', mc.invlogit(logit_C_0.stats()['95% HPD interval']).round(2)
    #print 'mu_rr_post:', dm.vars[dismod3.utils.gbd_key_for('relative-risk', region, year, sex)]['rate_stoch'].stats()['mean'].round(2)
    print 'mu_log_f_mesh_post:', log_f_mesh.stats()['mean'].round(2)
    print 'mu_f_post:', dm.vars[dismod3.utils.gbd_key_for('excess-mortality', region, year, sex)]['rate_stoch'].stats()['mean'].round(2)


    for k in keys:
        t,r,y,s = dismod3.utils.type_region_year_sex_from_key(k)

        if t in ['incidence', 'prevalence', 'remission', 'excess-mortality', 'mortality', 'prevalence_x_excess-mortality']:
            dismod3.neg_binom_model.store_mcmc_fit(dm, k, dm.vars[k])

        elif t in ['relative-risk', 'duration', 'incidence_x_duration']:
            dismod3.normal_model.store_mcmc_fit(dm, k, dm.vars[k])

    from fit_posterior import save_country_level_posterior
    if str(year) == '2005':  # also generate 2010 estimates
        save_country_level_posterior(dm, region, 2010, sex, ['prevalence', 'remission'])
    save_country_level_posterior(dm, region, year, sex, ['prevalence', 'remission'])  #'prevalence incidence remission excess-mortality duration mortality relative-risk'.split())


    # save results (do this last, because it removes things from the disease model that the plotting function, etc., might need)
    keys = dismod3.utils.gbd_keys(region_list=[region], year_list=[year], sex_list=[sex])
    dm.save('dm-%d-posterior-%s-%s-%s.json' % (dm.id, region, sex, year), keys_to_save=keys)

    return dm
Example #18
def plot_all_priors(model, data=None, unique=True, model_kwargs=None):
    """
	plot the priors of an HDDM model
	Input:
		data <DataFrame> - data to be plotted against the priors
		unique <bool> - whether to take unique values of each column in data before plotting it
	"""

    #set limits for plots
    lb = {'v': -10, 'dc(1)': -5, 'z': 0.001, 'z_std': 0}
    ub = {
        'a': 4,
        't': 1,
        'v': 10,
        'z': 1,
        'sz': 1,
        'st': 1,
        'sv': 15,
        'p_outlier': 1,
        'z_trans(1)': 1,
        'z(1)': 1,
        'dc(1)': 5,
        'a_std': 5,
        'v_std': 5,
        'z_std': 0.5,
        't_std': 5,
        'dc_std': 5
    }

    #plot all priors
    n_rows = 4
    n_cols = 5
    for n_subjs in [1]:  #,2]:

        # create a model
        # h_data, _ = hddm.generate.gen_rand_data(subjs=n_subjs, size=2)
        # if model_kwargs is None:
        #     model_kwargs = {}
        # h = model(h_data, include='all', **model_kwargs)

        #h = model

        fig = plt.figure()
        plt.subplots_adjust(left=0.1,
                            right=0.9,
                            top=0.9,
                            bottom=0.1,
                            hspace=.7)

        counter = 0
        for name, node_row in model.iter_group_nodes():
            if not name in ub:  # only those listed
                continue
            if 'var' in name or 'p_outlier' in name:
                continue
            if 'trans' in name:
                trans = True
                name = name.replace('_trans', '')
            else:
                trans = False
            counter += 1
            node = node_row['node']
            print(name)
            print(node.logp)

            #plot a single prior
            ax = plt.subplot(n_rows, n_cols, counter)
            ax.set_yticklabels([])

            #generate pdf
            xlim = np.arange(lb.get(name, 0.001), ub[name], 0.01)
            pdf = np.zeros(len(xlim))
            # assume that the logp has the prior?
            for i in range(len(pdf)):
                if not trans:
                    node.value = xlim[i]
                    pdf[i] = np.exp(node.logp)
                else:
                    node.value = pm.logit(xlim[i])
                    pdf[i] = np.exp(node.logp) * 10

            #plot shit
            plt.plot(xlim, pdf)
            plt.xlabel(name)
            sns.despine(offset=2, trim=True)

            # # Hide the right and top spines
#             ax.spines['right'].set_visible(False)
#             ax.spines['top'].set_visible(False)
#
#             # Only show ticks on the left and bottom spines
#             ax.yaxis.set_ticks_position('left')
#             ax.xaxis.set_ticks_position('bottom')

#add suptitle
        plt.suptitle('HDDM priors')

# save the figure
    plt.savefig(os.path.join(mypath, 'priorPlot.pdf'))
Example #19
 def mu(invlogit_mu=rate_stoch):
     return mc.logit(invlogit_mu)
Example #20
    # Obtain the spline representation of the log of the Monte Carlo-integrated 
    # likelihood function at each datapoint. The nodes are at .01,.02,...,.98,.99 .
    splrep_fname = hashlib.sha1(lo_age.tostring()+up_age.tostring()+pos.tostring()+neg.tostring()).hexdigest()+'.pickle'
    if splrep_fname in os.listdir('.'):
        splreps = cPickle.loads(file(splrep_fname).read())
    else:
        junk, splreps = age_corr_likelihoods(lo_age, up_age, pos, neg, 10000, np.arange(.01,1.,.01), a_pred, P_trace, S_trace, F_trace)
        file(splrep_fname,'w').write(cPickle.dumps(splreps))
    for i in xrange(len(splreps)):
        splreps[i] = list(splreps[i])

    # Don't worry, these are just reasonable initial values...
    if with_stukel:
        val_now = pm.stukel_logit((pos+1.)/(pos+neg+2.), a1.value, a2.value)
    else:
        val_now = pm.logit((pos+1.)/(pos+neg+2.))
    
    if data_mesh.shape[0] % chunk == 0:
        additional_index = 0
    else:
        additional_index = 1
    
    for i in xrange(0,data_mesh.shape[0] / chunk + additional_index):
        
        this_slice = slice(chunk*i, min((i+1)*chunk, data_mesh.shape[0]))

        # epsilon plus f, given f.
        @pm.stochastic(trace=False, dtype=np.float)
        def eps_p_f_now(value=val_now[this_slice], f=sp_sub.f_eval, V=V, sl=this_slice):
            return pm.normal_like(value, f[fi][sl], 1./V)
        eps_p_f_now.__name__ = "eps_p_f%i"%i
Example #22
def plot_all_priors(model, data=None, unique=True, model_kwargs=None):
    """
	plot the priors of an HDDM model

	Input:
		data <DataFrame> - data to be plotted against the priors
		unique <bool> - whether to take unique values of each column in data before plotting it
	"""

    #set limits for plots
    lb = {'v': -10}
    ub = {
        'a': 4,
        't': 1,
        'v': 10,
        'z': 1,
        'sz': 1,
        'st': 1,
        'sv': 15,
        'p_outlier': 1
    }

    #plot all priors
    n_rows = 2
    n_cols = 4
    for n_subjs in [1, 2]:

        #create a model
        h_data, _ = hddm.generate.gen_rand_data(subjs=n_subjs, size=2)
        if model_kwargs is None:
            model_kwargs = {}
        h = model(h_data, include='all', **model_kwargs)

        fig = plt.figure()
        counter = 0
        for name, node_row in h.iter_group_nodes():
            if 'var' in name:
                continue
            if 'trans' in name:
                trans = True
                name = name.replace('_trans', '')
            else:
                trans = False
            counter += 1
            node = node_row['node']

            #plot a single prior
            ax = plt.subplot(n_rows, n_cols, counter)

            #if data is given then plot it
            if data is not None:
                try:
                    if unique:
                        t_data = data[name].dropna().unique()
                    else:
                        t_data = data[name].dropna().values

                    # if name == 'v':
                    # t_data = np.concatenate((t_data, -t_data))
                    ax.hist(t_data, 20, normed=True)
                except KeyError:
                    pass

            #generate pdf
            xlim = arange(lb.get(name, 0.001), ub[name], 0.01)
            pdf = np.zeros(len(xlim))
            for i in range(len(pdf)):
                if not trans:
                    node.value = xlim[i]
                    pdf[i] = np.exp(node.logp)
                else:
                    node.value = pm.logit(xlim[i])
                    pdf[i] = np.exp(node.logp) * 10

            #plot shit
            plt.plot(xlim, pdf)
            plt.title(name)

        #add suptitle
        if n_subjs > 1:
            plt.suptitle('Group model')
        else:
            plt.suptitle('Subject model')
Example #23
def initial_guess(treatment):
    return np.median(logit((pos_counts[treatment_ids == treatment] + 1).astype(float) / (total_counts[treatment_ids == treatment] + 2)))
Example #24
def make_model(N,k,X,backend,manifold):
    """
    A standard spatial logistic regression.
    - N: Number sampled at each location
    - k: Number positive at each location
    - X: x,y,z coords of each location
    - Backend: The linear algebra backend. So far, this has to be 'cholmod'. 
    - manifold: The manifold to work on. So far, this has to be 'spherical'.
    """
    
    # Make the Delaunay triangulation.
    neighbors, triangles, trimap, b = manifold.triangulate_sphere(X)

    # Uncomment to visualize the triangulation.
    # manifold.plot_triangulation(X,neighbors)

    # Generate the C, Ctilde and G matrix in SciPy 'lil' format.
    triangle_areas = [manifold.triangle_area(X, t) for t in triangles]
    Ctilde = manifold.Ctilde(X, triangles, triangle_areas)
    C = manifold.C(X, triangles, triangle_areas)
    G = manifold.G(X, triangles, triangle_areas)

    # Convert to SciPy 'csc' format for efficient use by the CHOLMOD backend.
    C = backend.into_matrix_type(C)
    Ctilde = backend.into_matrix_type(Ctilde)
    G = backend.into_matrix_type(G)

    # Kappa is the scale parameter. It's a free variable.
    kappa = pm.Exponential('kappa',1,value=3)

    # Fix the value of alpha.
    alpha = 2.

    # amp is the overall amplitude. It's a free variable that will probably be highly confounded with kappa.
    amp = pm.Exponential('amp', .0001, value=100)

    # A constant mean.
    m = pm.Uninformative('m',value=0)
    
    @pm.deterministic(trace=False)
    def M(m=m,n=len(X)):
        """The mean vector"""
        return np.ones(n)*m
        
    @pm.deterministic(trace=False)
    def Q(kappa=kappa, alpha=alpha, amp=amp, Ctilde=Ctilde, G=G, backend=backend):
        "The precision matrix."
        out = operators.mod_frac_laplacian_precision(Ctilde, G, kappa, alpha, backend)/np.asscalar(amp)**2
        return out

    # Do all the precomputation you can based on the sparsity pattern alone.
    # Note that if alpha is made free, this needs to be free also, as the sparsity
    # pattern will be changeable.
    pattern_products = backend.pattern_to_products(Q.value)

    @pm.deterministic(trace=False)
    def precision_products(Q=Q, p=pattern_products):
        "All the analysis of the precision matrix that the backend needs to do MVN computations."
        try: 
            return backend.precision_to_products(Q, **p)
        except backend.NonPositiveDefiniteError:
            return None

    # The random field.
    empirical_S = pm.logit((k+1)/(N+2.))
    S=pymc_objects.SparseMVN('S',M, precision_products, backend, value=empirical_S)
    
    @pm.deterministic(trace=False)
    def p(S=S):
        """The success probability."""
        return pm.invlogit(S)

    # The data.
    data = pm.Binomial('data', n=N, p=p, value=k, observed=True)
    
    # A Fortran representation of the likelihood, to allow for fast Metropolis steps without querying data.logp.
    likelihood_variables = np.vstack((np.resize(N,k.shape),k)).T
    likelihood_string = """
    lkp = dexp({X})/(1.0D0+dexp({X}))
    lkp = lv(i,2)*dlog(lkp) + (lv(i,1)-lv(i,2))*dlog(1.0D0-lkp)
    """
    
    return locals()
Example #25
 def pripred_check(m=m,amp=amp,V=V):
     p_above = scipy.stats.distributions.norm.cdf(m-pm.logit(threshold_val), 0, np.sqrt(amp**2+V))
     if p_above <= max_p_above:
         return 0.
     else:
         return -np.inf
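
The potential above is a prior-predictive constraint: it computes a normal CDF on the logit scale with total variance amp**2 + V and rejects (returns -inf for) parameter values for which that probability exceeds max_p_above. A standalone sketch of the same computation, with scipy standing in for the snippet's pm and scipy.stats calls and all numbers made up:

import numpy as np
from scipy.special import logit
from scipy.stats import norm

m, amp, V = -2.0, 1.0, 0.5
threshold_val, max_p_above = 0.9, 0.05

p_above = norm.cdf(m - logit(threshold_val), 0, np.sqrt(amp**2 + V))
print(p_above, 0.0 if p_above <= max_p_above else -np.inf)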
Example #26
def setup(dm, key, data_list, rate_stoch=None, emp_prior={}):
    """ Generate the PyMC variables for a beta binomial model of
    a single rate function

    Parameters
    ----------
    dm : dismod3.DiseaseModel
      the object containing all the data, priors, and additional
      information (like input and output age-mesh)
      
    key : str
      the name of the key for everything about this model (priors,
      initial values, estimations)

    data_list : list of data dicts
      the observed data to use in the beta-binomial likelihood function

    rate_stoch : pymc.Stochastic, optional
      a PyMC stochastic (or deterministic) object, with
      len(rate_stoch.value) == len(dm.get_estimation_age_mesh()).
      This is used to link beta-binomial stochs into a larger model,
      for example.

    emp_prior : dict, optional
      the empirical prior dictionary, retrieved from the disease model
      if appropriate by::

          >>> t, r, y, s = type_region_year_sex_from_key(key)
          >>> emp_prior = dm.get_empirical_prior(t)
      

    Results
    -------
    vars : dict
      Return a dictionary of all the relevant PyMC objects for the
      beta binomial model.  vars['rate_stoch'] is of particular
      relevance; this is what is used to link the beta-binomial model
      into more complicated models, like the generic disease model.

    Details
    -------
    The beta binomial model parameters are the following:
      * the mean age-specific rate function
      * dispersion of this mean
      * the p_i value for each data observation that has a standard
        error (data observations that do not have standard errors
        recorded are fit as observations of the beta r.v., while
        observations with standard errors recorded have a latent
        variable for the beta, and an observed binomial r.v.).
    """
    vars = {}
    est_mesh = dm.get_estimate_age_mesh()
    if np.any(np.diff(est_mesh) != 1):
        raise ValueError, "ERROR: Gaps in estimation age mesh must all equal 1"

    # set up age-specific rate function, if it does not yet exist
    if not rate_stoch:
        param_mesh = dm.get_param_age_mesh()

        if emp_prior.has_key("mu"):
            initial_value = emp_prior["mu"]
        else:
            initial_value = dm.get_initial_value(key)

        # find the logit of the initial values, which is a little bit
        # of work because initial values are sampled from the est_mesh,
        # but the logit_initial_values are needed on the param_mesh
        logit_initial_value = mc.logit(interpolate(est_mesh, initial_value, param_mesh))

        logit_rate = mc.Normal(
            "logit(%s)" % key, mu=-5.0 * np.ones(len(param_mesh)), tau=1.0e-2, value=logit_initial_value
        )
        # logit_rate = [mc.Normal('logit(%s)_%d' % (key, a), mu=-5., tau=1.e-2) for a in param_mesh]
        vars["logit_rate"] = logit_rate

        @mc.deterministic(name=key)
        def rate_stoch(logit_rate=logit_rate):
            return interpolate(param_mesh, mc.invlogit(logit_rate), est_mesh)

    if emp_prior.has_key("mu"):

        @mc.potential(name="empirical_prior_%s" % key)
        def emp_prior_potential(f=rate_stoch, mu=emp_prior["mu"], tau=1.0 / np.array(emp_prior["se"]) ** 2):
            return mc.normal_like(f, mu, tau)

        vars["empirical_prior"] = emp_prior_potential

    vars["rate_stoch"] = rate_stoch

    # create stochastic variable for over-dispersion "random effect"
    mu_od = emp_prior.get("dispersion", 0.001)
    dispersion = mc.Gamma("dispersion_%s" % key, alpha=10.0, beta=10.0 / mu_od)
    vars["dispersion"] = dispersion

    @mc.deterministic(name="alpha_%s" % key)
    def alpha(rate=rate_stoch, dispersion=dispersion):
        return rate / dispersion ** 2

    @mc.deterministic(name="beta_%s" % key)
    def beta(rate=rate_stoch, dispersion=dispersion):
        return (1.0 - rate) / dispersion ** 2

    vars["alpha"] = alpha
    vars["beta"] = beta

    # create potentials for priors
    vars["priors"] = generate_prior_potentials(dm.get_priors(key), est_mesh, rate_stoch, dispersion)

    # create latent and observed stochastics for data
    vars["data"] = data_list
    vars["ab"] = []
    vars["latent_p"] = []
    vars["observations"] = []

    for d in data_list:
        # set up observed stochs for all relevant data
        id = d["id"]

        if d["value"] == MISSING:
            print "WARNING: data %d missing value" % id
            continue

        # ensure all rate data is valid
        d_val = dm.value_per_1(d)
        d_se = dm.se_per_1(d)

        if d_val < 0 or d_val > 1:
            print "WARNING: data %d not in range [0,1]" % id
            continue

        if d["age_start"] < est_mesh[0] or d["age_end"] > est_mesh[-1]:
            raise ValueError, "Data %d is outside of estimation range---([%d, %d] is not inside [%d, %d])" % (
                d["id"],
                d["age_start"],
                d["age_end"],
                est_mesh[0],
                est_mesh[-1],
            )

        age_indices = indices_for_range(est_mesh, d["age_start"], d["age_end"])
        age_weights = d["age_weights"]

        @mc.deterministic(name="a_%d^%s" % (id, key))
        def a_i(alpha=alpha, age_indices=age_indices, age_weights=age_weights):
            return rate_for_range(alpha, age_indices, age_weights)

        @mc.deterministic(name="b_%d^%s" % (id, key))
        def b_i(beta=beta, age_indices=age_indices, age_weights=age_weights):
            return rate_for_range(beta, age_indices, age_weights)

        vars["ab"] += [a_i, b_i]

        if d_se > 0:
            # if the data has a standard error, model it as a realization
            # of a beta binomial r.v.
            latent_p_i = mc.Beta(
                "latent_p_%d^%s" % (id, key), alpha=a_i, beta=b_i, value=trim(d_val, NEARLY_ZERO, 1 - NEARLY_ZERO)
            )
            vars["latent_p"].append(latent_p_i)

            denominator = d_val * (1 - d_val) / d_se ** 2.0
            numerator = d_val * denominator
            obs_binomial = mc.Binomial(
                "data_%d^%s" % (id, key), value=numerator, n=denominator, p=latent_p_i, observed=True
            )
            vars["observations"].append(obs_binomial)
        else:
            # if the data is a point estimate with no uncertainty
            # recorded, model it as a realization of a beta r.v.
            obs_p_i = mc.Beta(
                "latent_p_%d" % id, value=trim(d_val, NEARLY_ZERO, 1 - NEARLY_ZERO), alpha=a_i, beta=b_i, observed=True
            )
            vars["observations"].append(obs_p_i)

    return vars
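
The alpha and beta deterministics above parameterize the beta distribution so that its mean equals the rate and the dispersion parameter controls its spread. A quick numerical check of that parameterization with numpy, using made-up values of the rate and dispersion:

import numpy as np

p, d = 0.1, 0.05
alpha, beta = p / d**2, (1 - p) / d**2    # 40 and 360
rng = np.random.default_rng(0)
draws = rng.beta(alpha, beta, size=100000)
print(draws.mean(), draws.std())          # mean near 0.1; std shrinks as d decreases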
Example #27
            key = dismod3.gbd_key_for('%s', region, year, sex)

            if clean(region) == 'north_america_high_income':
                regional_offset = 0.
            else:
                regional_offset = -.5

            time_offset = (int(year)-1997)/10.

            if clean(sex) == 'male':
                sex_offset = .1
            else:
                sex_offset = 0.
            
            # incidence rate
            i = mc.invlogit(mc.logit(.012 * mc.invlogit((ages - 44) / 3)) + regional_offset + time_offset + sex_offset)
            truth[key % 'incidence'] = i

            # remission rate
            r = 0. * ages
            truth[key % 'remission'] = r

            # excess-mortality rate
            f = .085 * (ages / 100) ** 2.5
            truth[key % 'excess-mortality'] = f

            ## compartmental model (bins S, C, D, M)
            SCDM = np.zeros([4, age_len])
            SCDM[0,0] = 1.

            for a in range(age_len - 1):
Example #28
def make_model(lon,lat,input_data,covariate_keys,pos,neg):
    """
    This function is required by the generic MBG code.
    """
    
    # How many nuggeted field points to handle with each step method
    grainsize = 10

    # Unique data locations
    data_mesh, logp_mesh, fi, ui, ti = uniquify(lon, lat)
    
    s_hat = (pos+1.)/(pos+neg+2.)
        
    # The partial sill.
    amp = pm.Exponential('amp', .1, value=1.4)

    # The range parameters. Units are RADIANS. 
    # 1 radian = the radius of the earth, about 6378.1 km
    scale = pm.Exponential('scale', .1, value=.07)
    @pm.potential
    def scale_constraint(scale=scale):
        if scale>.5:
            return -np.inf
        else:
            return 0

    # This parameter controls the degree of differentiability of the field.
    diff_degree = pm.Uniform('diff_degree', .01, 3, value=0.5, observed=True)

    # The nugget variance.
    V = pm.Exponential('V', .1, value=1)
    # @pm.potential
    # def V_constraint(V=V):
    #     if V<.1:
    #         return -np.inf
    #     else:
    #         return 0

    a0 = pm.Normal('a0',0,.1,value=0,observed=True)
    # a1 limits mixing.
    a1 = pm.Normal('a1',0,.1,value=0,observed=True)
    a = pm.Lambda('a',lambda a0=a0,a1=a1: [a0,a1])

    m = pm.Uninformative('m',value=-13)
    @pm.deterministic(trace=False)
    def M(m=m):
        return pm.gp.Mean(mean_fn, m=m)
    
    if constrained:
        @pm.potential
        def pripred_check(m=m,amp=amp,V=V,a=a):
            p_above = scipy.stats.distributions.norm.cdf(m-pm.stukel_logit(threshold_val,*a), 0, np.sqrt(amp**2+V))
            if p_above <= max_p_above:
                return 0.
            else:
                return -np.inf

    # Create the covariance & its evaluation at the data locations.
    facdict = dict([(k,1.e6) for k in covariate_keys])
    facdict['m'] = 0
    @pm.deterministic(trace=False)
    def C(amp=amp, scale=scale, diff_degree=diff_degree, ck=covariate_keys, id=input_data, ui=ui, facdict=facdict):
        """A covariance function created from the current parameter values."""
        eval_fn = CovarianceWithCovariates(pm.gp.matern.geo_rad, id, ck, ui, fac=facdict)
        return pm.gp.FullRankCovariance(eval_fn, amp=amp, scale=scale, diff_degree=diff_degree)

    sp_sub = pm.gp.GPSubmodel('sp_sub', M, C, logp_mesh, tally_f=False)
            
    # Make f start somewhere a bit sane
    sp_sub.f_eval.value = sp_sub.f_eval.value - np.mean(sp_sub.f_eval.value)

    # Loop over data clusters
    eps_p_f_d = []
    s_d = []
    data_d = []

    for i in xrange(len(pos)/grainsize+1):
        sl = slice(i*grainsize,(i+1)*grainsize,None)        
        if len(pos[sl])>0:
            # Nuggeted field in this cluster
            eps_p_f_d.append(pm.Normal('eps_p_f_%i'%i, sp_sub.f_eval[fi[sl]], 1./V, value=pm.logit(s_hat[sl]), trace=False))            

            # The allele frequency
            s_d.append(pm.Lambda('s_%i'%i,lambda lt=eps_p_f_d[-1], a=a: pm.flib.stukel_invlogit(lt, *a),trace=False))

            # The observed allele frequencies
            data_d.append(pm.Binomial('data_%i'%i, pos[sl]+neg[sl], s_d[-1], value=pos[sl], observed=True))
    
    # The field plus the nugget
    @pm.deterministic
    def eps_p_f(eps_p_fd = eps_p_f_d):
        """Concatenated version of eps_p_f, for postprocessing & Gibbs sampling purposes"""
        return np.hstack(eps_p_fd)
            
    return locals()
Example #29
Y = df['Parameter Value'].__array__()
X = .5 * (df['Age Start'] + df['Age End']).__array__()
pl.plot(X, Y, 'ks', label='Observed', mec='w', mew=1)

XX = sm.add_constant(X)
X_pred = pl.arange(65)
XX_pred = sm.add_constant(X_pred)

model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)

pl.plot(X_pred, Y_pred, 'k-', linewidth=2, label='Predicted by OLS')

Y = mc.logit(df['Parameter Value'].__array__())
model = sm.OLS(Y, XX)
results = model.fit()
Y_pred = model.predict(XX_pred)

pl.plot(X_pred,
        mc.invlogit(Y_pred),
        'k--',
        linewidth=2,
        label='Predicted by logit-transformed OLS')

pl.xlabel('Age (Years)')
pl.ylabel('Seroprevalence (Per 1)')
pl.legend(loc='lower right', fancybox=True, shadow=True)
pl.axis([-5, 55, 0, 1.2])
pl.grid()
Example #30
def make_model(lon,lat,covariate_values,pos,neg,cpus=1):
    """
    This function is required by the generic MBG code.
    """
    
    # How many nuggeted field points to handle with each step method
    grainsize = 10
        
    # Non-unique data locations
    data_mesh = combine_spatial_inputs(lon, lat)
    
    s_hat = (pos+1.)/(pos+neg+2.)
    
    # Uniquify the data locations.
    locs = [(lon[0], lat[0])]
    fi = [0]
    ui = [0]
    for i in xrange(1,len(lon)):

        # If repeat location, add observation
        loc = (lon[i], lat[i])
        if loc in locs:
            fi.append(locs.index(loc))

        # Otherwise, new obs
        else:
            locs.append(loc)
            fi.append(max(fi)+1)
            ui.append(i)
    fi = np.array(fi)
    ti = [np.where(fi == i)[0] for i in xrange(max(fi)+1)]
    ui = np.asarray(ui)

    lon = np.array(locs)[:,0]
    lat = np.array(locs)[:,1]

    # Unique data locations
    logp_mesh = combine_spatial_inputs(lon,lat)
    
    # Create the mean & its evaluation at the data locations.
    M, M_eval = trivial_means(logp_mesh)

    init_OK = False
    while not init_OK:
        try:        
            # Space-time component
            sp_sub = ibd_covariance_submodel()    
            covariate_dict, C_eval = cd_and_C_eval(covariate_values, sp_sub['C'], data_mesh, ui)

            # The field evaluated at the uniquified data locations            
            f = pm.MvNormalCov('f', M_eval, C_eval)
            # Make f start somewhere a bit sane
            f.value = f.value - np.mean(f.value)
        
            # Loop over data clusters
            eps_p_f_d = []
            s_d = []
            data_d = []

            for i in xrange(len(pos)/grainsize+1):
                sl = slice(i*grainsize,(i+1)*grainsize,None)
                # Nuggeted field in this cluster
                eps_p_f_d.append(pm.Normal('eps_p_f_%i'%i, f[fi[sl]], 1./sp_sub['V'], value=pm.logit(s_hat[sl]),trace=False))

                # The allele frequency
                s_d.append(pm.Lambda('s_%i'%i,lambda lt=eps_p_f_d[-1]: invlogit(lt),trace=False))

                # The observed allele frequencies
                data_d.append(pm.Binomial('data_%i'%i, pos[sl]+neg[sl], s_d[-1], value=pos[sl], observed=True))
            
            # The field plus the nugget
            @pm.deterministic
            def eps_p_f(eps_p_fd = eps_p_f_d):
                """Concatenated version of eps_p_f, for postprocessing & Gibbs sampling purposes"""
                return np.concatenate(eps_p_fd)
            
            init_OK = True
        except pm.ZeroProbability, msg:
            print 'Trying again: %s'%msg
            init_OK = False
            gc.collect()
Example #31
def plot_all_priors(model, data=None, unique=True, model_kwargs=None):
	"""
	plot the priors of an HDDM model

	Input:
		data <DataFrame> - data to be plotted against the priors
		unique <bool> - whether to take unique values of each column in data before plotting it
	"""

	#set limits for plots
	lb = {'v': -10}
	ub = {'a': 4, 't':1, 'v':10, 'z':1, 'sz': 1, 'st':1, 'sv':15, 'p_outlier': 1}

	#plot all priors
	n_rows=4
	n_cols=2
	for n_subjs in [1]: #,2]:

		#create a model
		h_data, _ = hddm.generate.gen_rand_data(subjs=n_subjs, size=2)
		if model_kwargs is None:
			model_kwargs = {}
		h = model(h_data, include='all', **model_kwargs)

		fig = plt.figure()
                plt.subplots_adjust(left=0.1, right=0.9, top=0.9, bottom=0.1, hspace=.7)

		counter = 0
		for name, node_row in h.iter_group_nodes():
			if 'var' in name or 'p_outlier' in name:
				continue
			if 'trans' in name:
				trans = True
				name = name.replace('_trans','')
			else:
				trans = False
			counter += 1
			node = node_row['node']

			#plot a single prior
			ax = plt.subplot(n_rows, n_cols, counter)
                        ax.set_yticklabels([])
			#if data is given then plot it
			if data is not None:
				try:
					if unique:
						t_data = data[name].dropna().unique()
					else:
						t_data = data[name].dropna().values

					# if name == 'v':
						# t_data = np.concatenate((t_data, -t_data))
					ax.hist(t_data, 20, normed=True)
				except KeyError:
					pass

			#generate pdf
			xlim = np.arange(lb.get(name, 0.001), ub[name], 0.01)
			pdf = np.zeros(len(xlim))
			for i in range(len(pdf)):
				if not trans:
					node.value = xlim[i]
					pdf[i] = np.exp(node.logp)
				else:
					node.value = pm.logit(xlim[i])
					pdf[i] = np.exp(node.logp)*10

			#plot shit
			plt.plot(xlim, pdf)
			plt.title(name)

		#add suptitle
		if n_subjs > 1:
			plt.suptitle('Group model')
		else:
			plt.suptitle('HDDM Informative model')
Example #32
### @export 'more-remission'
reload(book_graphics)
for i, k_i in enumerate(model.parameters[t]['parameter_age_mesh']):
    model.vars['f']['gamma'][i].value = pl.log(k_i*.005 + .001)
book_graphics.plot_age_patterns(model, types='i r m f p'.split(), xticks=[0,50,100],
                                yticks=dict(i=[0,.01,.02], r=[0,.05,.1], m=[0,.2,.4], f=[0,.3,.6], p=[0,.01,.02]),
                                panel='a')
pl.subplots_adjust(wspace=.5)
pl.savefig('book/graphics/more-excess-mortality.pdf')
# <codecell>

### @export 'birth_prevalence'

p_0 = .015
model.vars['logit_C0'].value = mc.logit(p_0)
p = model.vars['p']['mu_age'].value

print """
 For a condition with prevalence of
  %.1f\\%% at age $0$, these rates yield a prevalence age pattern which is
  highly nonlinear, dipping to a minimum of %.1f\\%% at age %d, and then
  increasing back up to %.1f\\%% at the oldest ages.
""" % (p_0*100, p.min()*100, p.argmin(), p[-1]*100)

book_graphics.plot_age_patterns(model, types='i r m f p'.split(), xticks=[0,50,100],
                                yticks=dict(i=[0,.01,.02], r=[0,.05,.1], m=[0,.2,.4], f=[0,.3,.6], p=[.01,.015,.02]),
                                panel='b')

pl.savefig('book/graphics/birth-prevalence.pdf')
# <codecell>