Example #1
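A dismod3 integration test: it perturbs simulated disease data with truncated-normal noise at a fixed coefficient of variation, fits empirical priors and a posterior, and then checks the fit against both the noisy data and the underlying truth.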
import numpy as np
import pymc as mc

import dismod3
# DiseaseJson, check_emp_prior_fits, and check_posterior_fits are helpers
# from the surrounding test module.

def test_simulated_disease():
    """ Test fit for simulated disease data"""

    # load model to test fitting
    dm = DiseaseJson(open('tests/test_disease_1.json').read())

    # filter and noise up data
    cov = .5
    
    data = []
    for d in dm.data:
        d['truth'] = d['value']
        if dismod3.utils.clean(d['gbd_region']) == 'north_america_high_income':
            if d['data_type'] == 'all-cause mortality data':
                data.append(d)
            else:
                se = (cov * d['value'])
                d['value'] = mc.rtruncnorm(d['truth'], se**-2, 0, np.inf)
                d['age_start'] -= 5
                d['age_end'] = d['age_start']+9
                d['age_weights'] = np.ones(d['age_end']-d['age_start']+1)
                d['age_weights'] /= float(len(d['age_weights']))

                d['standard_error'] = se

                data.append(d)

    dm.data = data
    
    # fit empirical priors and compare fit to data
    from dismod3 import neg_binom_model
    for rate_type in 'prevalence incidence remission excess-mortality'.split():
        neg_binom_model.fit_emp_prior(dm, rate_type, '/dev/null')
        check_emp_prior_fits(dm)


    # fit posterior
    delattr(dm, 'vars')  # remove vars so that gbd_disease_model creates its own version
    from dismod3 import gbd_disease_model
    keys = dismod3.utils.gbd_keys(region_list=['north_america_high_income'],
                                  year_list=[1990], sex_list=['male'])
    gbd_disease_model.fit(dm, method='map', keys=keys, verbose=1)     ## first generate decent initial conditions
    gbd_disease_model.fit(dm, method='mcmc', keys=keys, iter=1000, thin=5, burn=5000, verbose=1, dbname='/dev/null')     ## then sample the posterior via MCMC


    print 'error compared to the noisy data (coefficient of variation = %.2f)' % cov
    check_posterior_fits(dm)


    for d in dm.data:
        d['value'] = d['truth']
        d['age_start'] += 5
        d['age_end'] = d['age_start']
        d['age_weights'] = np.ones(d['age_end']-d['age_start']+1)
        d['age_weights'] /= float(len(d['age_weights']))

    print 'error compared to the truth'
    check_posterior_fits(dm)

    return dm
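The noise step above replaces each observation with a truncated-normal draw whose standard error is a fixed fraction of the truth. A minimal sketch of just that step, assuming a truth value of 0.1 (mc is PyMC 2, whose rtruncnorm takes a mean, a precision tau, and lower and upper truncation bounds):

import numpy as np
import pymc as mc

truth, cov = 0.1, .5
se = cov * truth                  # standard error at the chosen coefficient of variation
tau = se**-2                      # PyMC parameterizes the normal by precision
noisy = mc.rtruncnorm(truth, tau, 0, np.inf)   # truncate to nonnegative rates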
Example #2
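A PyMC step method that resamples each element of g in turn: candidate values are drawn from a truncated-normal prior that respects the linear constraints, and one candidate is selected with probability proportional to the likelihood of the non-constraint children.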
def step(self):

    # The right-hand sides for the linear constraints
    self.rhs = dict(zip(self.constraint_offdiags,
                        [np.asarray(np.dot(pm.utils.value(od), self.g.value)).squeeze() for od in self.constraint_offdiags]))

    for i in xrange(self.n):

        try:
            lb, ub, rhs = self.get_bounds(i)
        except ConstraintError:
            warnings.warn('Bounds could not be set; this element is very highly constrained')
            continue

        # Candidate values: the current value, plus draws from the truncated-normal prior.
        newgs = np.hstack((self.g.value[i], pm.rtruncnorm(0, 1, lb, ub, size=self.n_draws)))
        lpls = np.hstack((self.get_likelihood_only(), np.empty(self.n_draws)))
        for j, newg in enumerate(newgs[1:]):
            self.set_g_value(newg, i)
            # The newgs are drawn from the prior, taking the constraints into
            # account, so accept them based on the 'likelihood children' only.
            try:
                lpls[j+1] = self.get_likelihood_only()
            except pm.ZeroProbability:
                lpls[j+1] = -np.inf

        # Normalize and pick one candidate with probability proportional to
        # its likelihood.
        lpls -= pm.flib.logsum(lpls)
        newg = newgs[pm.rcategorical(np.exp(lpls))]
        self.set_g_value(newg, i)

        # Update the stored right-hand sides with the accepted value.
        for od in self.constraint_offdiags:
            rhs[od] += np.asarray(pm.utils.value(od))[:,i].squeeze() * newg
        self.rhs = rhs
Example #3
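A minimal Metropolis proposal for a bounded stochastic: the new value is drawn from a truncated normal centered at the current value, with spread controlled by the tuned scale factor.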
def propose(self):
    # Propose from a truncated normal centered at the current value; the
    # precision comes from the tuned proposal standard deviation.
    tau = 1. / (self.adaptive_scale_factor * self.proposal_sd) ** 2
    self.stochastic.value = pm.rtruncnorm(
        self.stochastic.value,
        tau,
        self.low_bound,
        self.up_bound)
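Note that a truncated-normal proposal is not symmetric when the current or proposed value sits near a bound, so a step method built on this propose needs a matching Hastings correction; Example #4 below computes exactly that correction with pm.truncnorm_like.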
Example #4
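A Metropolis-Hastings step over the elements of g: each jump is drawn from a truncated normal, and acceptance combines the likelihood difference, the prior difference, and an explicit Hastings factor for the asymmetric proposal.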
def step(self):

    # TODO: Propose from something other than the prior, and tune using the asf's.
    # The right-hand sides for the linear constraints
    self.rhs = dict(zip(self.constraint_offdiags,
                        [np.asarray(np.dot(pm.utils.value(od), self.g.value)).squeeze() for od in self.constraint_offdiags]))
    this_round = np.zeros(self.n, dtype='int')

    for i in xrange(self.n):
        self.check_constraints()
        # Jump an element of g.
        lb, ub, rhs = self.get_bounds(i)

        # Propose a new value from a truncated normal centered at the current value.
        curg = self.g.value[i]
        tau = 1. / self.adaptive_scale_factor[i]**2
        newg = pm.rtruncnorm(curg, tau, lb, ub)[0]

        # The Hastings factor: log q(cur|new) - log q(new|cur).
        hf = pm.truncnorm_like(curg, newg, tau, lb, ub) - pm.truncnorm_like(newg, curg, tau, lb, ub)

        # The difference in prior log-probabilities of g
        dpri = .5 * (curg**2 - newg**2)

        # Get the current log-likelihood of the non-constraint children.
        lpl = self.get_likelihood_only()

        # Save the children's current values in case the jump is rejected.
        cv = {}
        for od in self.all_offdiags:
            for c in od.children:
                cv[c] = c.value.copy()

        # Enter the proposed value and get the proposed log-likelihood.
        self.set_g_value(newg, i)
        try:
            lpl_p = self.get_likelihood_only()
        except pm.ZeroProbability:
            self.reject(i, cv)
            self.check_constraints()
            this_round[i] = -1
            continue

        # M-H acceptance
        if np.log(np.random.random()) < lpl_p - lpl + hf + dpri:
            self.accepted[i] += 1
            this_round[i] = 1
            for od in self.constraint_offdiags:
                rhs[od] += np.asarray(pm.utils.value(od))[:,i].squeeze() * newg
            self.rhs = rhs
            self.check_constraints()
        else:
            self.reject(i, cv)
            self.check_constraints()
            this_round[i] = -1
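The acceptance arithmetic above can be read in isolation. A minimal sketch, assuming a standard-normal prior on g (which is what dpri = .5*(curg**2 - newg**2) encodes) and precomputed log-likelihoods lpl and lpl_p:

import numpy as np
import pymc as pm

def mh_accept(curg, newg, tau, lb, ub, lpl, lpl_p):
    # Hastings factor: log q(cur | new) - log q(new | cur); nonzero because
    # truncation makes the proposal density asymmetric near the bounds.
    hf = pm.truncnorm_like(curg, newg, tau, lb, ub) \
         - pm.truncnorm_like(newg, curg, tau, lb, ub)
    # Difference in standard-normal prior log-densities of g.
    dpri = .5 * (curg**2 - newg**2)
    return np.log(np.random.random()) < lpl_p - lpl + hf + dpri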
Example #5
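A helper from the dismod3 test suite that simulates one data row per age interval: it computes the true rate over the interval from a truth vector and population weights, then optionally adds truncated-normal noise at a coefficient of variation of cov percent.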
import numpy as np
import pymc as mc

import dismod3
# population_by_age is assumed to be a module-level lookup keyed by
# (country, year, sex) strings.

def generate_and_append_data(data, data_type, truth, age_intervals, condition,
                             gbd_region, country, year, sex, effective_sample_size, cov=0.):
    """ Create one simulated data row per age interval and append it to data"""
    for a0, a1 in age_intervals:
        d = { 'condition': condition,
              'data_type': data_type,
              'gbd_region': gbd_region,
              'region': country,
              'year_start': year,
              'year_end': year,
              'sex': sex,
              'age_start': a0,
              'age_end': a1,
              'id': len(data),}

        holdout = 0
        d['ignore'] = holdout
        d['test_set'] = holdout

        ages = range(a0, a1 + 1)

        if data_type == 'incidence_x_duration':
            pop = 1. * np.ones_like(ages)
        else:
            pop = np.array([population_by_age[(country, str(year), sex)][a] for a in ages])
            if np.sum(pop) > 0:
                pop /= float(np.sum(pop))  # normalize the pop weights to sum to 1
            else:
                pop = np.ones_like(ages) / float(len(ages))  # for countries where pop is zero, fill in constant structure
        d['age_weights'] = list(pop)

        p0 = dismod3.utils.rate_for_range(truth, ages, pop)
        d['truth'] = p0

        if p0 == 0 or cov == 0:
            p1 = p0
            d['value'] = p1
            d['effective_sample_size'] = effective_sample_size
        else:
            p1 = mc.rtruncnorm(p0, (cov/100. * p0)**-2, 0, np.inf)
            assert not np.isnan(p1)

            d['value'] = p1
            d['standard_error'] = p0 * cov/100.
        data.append(d)
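A hypothetical call, assuming truth is a rate vector indexed by single year of age and that population_by_age covers the requested country-year-sex:

data = []
truth = .01 * np.ones(101)   # hypothetical: a constant rate over ages 0-100
generate_and_append_data(data, 'prevalence data', truth,
                         age_intervals=[(0, 9), (10, 19), (20, 29)],
                         condition='test_disease', gbd_region='north_america_high_income',
                         country='USA', year=1990, sex='male',
                         effective_sample_size=1000., cov=2.)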
Example #6
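An end-to-end simulation study: gold-standard data are noised and subsampled, empirical priors and the posterior are fit, and posterior predictions are scored against the truth by median absolute relative error and coverage.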
import random

import numpy as np
import pymc as mc
from numpy import arange, array, median, mean

import dismod3
# DiseaseJson, check_emp_prior_fits, check_posterior_fits, and generate_figure
# are helpers from the surrounding module.

def fit_simulated_disease(n=300, cv=2.):
    """ Test fit for simulated disease data with noise and missingness"""

    # load model to test fitting
    dm = DiseaseJson(open('tests/simulation_gold_standard.json').read())
    
    # adjust any priors and covariates as desired
    dm.set_param_age_mesh(arange(0,101,2))
    for type in 'incidence prevalence remission excess_mortality'.split():
        dm.params['global_priors']['heterogeneity'][type] = 'Very'
        dm.params['covariates']['Country_level']['LDI_id']['rate']['value'] = 0
    
    # filter and noise up data
    mort_data = []
    all_data = []
    for d in dm.data:
        d['truth'] = d['value']
        d['age_weights'] = array([1.])
        if d['data_type'] == 'all-cause mortality data':
            mort_data.append(d)
        else:
            if d['value'] > 0:
                se = (cv / 100.) * d['value']
                Y_i = mc.rtruncnorm(d['truth'], se**-2, 0, np.inf)
                d['value'] = Y_i
                d['standard_error'] = se
                d['effective_sample_size'] = Y_i * (1-Y_i) / se**2


            all_data.append(d)
    sampled_data = random.sample(all_data, n) + mort_data
    dm.data = sampled_data

    # fit empirical priors and compare fit to data
    from dismod3 import neg_binom_model
    for rate_type in 'prevalence incidence remission excess-mortality'.split():
        #neg_binom_model.fit_emp_prior(dm, rate_type, iter=1000, thin=1, burn=0, dbname='/dev/null')
        neg_binom_model.fit_emp_prior(dm, rate_type, iter=30000, thin=15, burn=15000, dbname='/dev/null')
        check_emp_prior_fits(dm)


    # fit posterior
    delattr(dm, 'vars')  # remove vars so that gbd_disease_model creates its own version
    from dismod3 import gbd_disease_model
    keys = dismod3.utils.gbd_keys(region_list=['north_america_high_income'],
                                  year_list=[1990], sex_list=['male'])
    gbd_disease_model.fit(dm, method='map', keys=keys, verbose=1)     ## first generate decent initial conditions
    gbd_disease_model.fit(dm, method='mcmc', keys=keys, iter=30000, thin=15, burn=15000, verbose=1, dbname='/dev/null')     ## then sample the posterior via MCMC
    #gbd_disease_model.fit(dm, method='mcmc', keys=keys, iter=1000, thin=1, burn=0, verbose=1, dbname='/dev/null')     ## fast for dev


    print 'error compared to the noisy data (coefficient of variation = %.2f)' % cv
    check_posterior_fits(dm)

    dm.data = all_data
    for d in dm.data:
        if d['data_type'] != 'all-cause mortality data':
            d['noisy_data'] = d['value']
            d['value'] = d['truth']

    print 'error compared to the truth'
    are, coverage = check_posterior_fits(dm)
    print
    print 'Median Absolute Relative Error of Posterior Predictions:', median(are)
    print 'Pct coverage:', 100*mean(coverage)
    f = open('score_%d_%f.txt' % (n, cv), 'a')
    f.write('%10.10f,%10.10f\n' % (median(are), mean(coverage)))
    f.close()

    dm.all_data = all_data
    dm.data = sampled_data
    for d in dm.data:
        if d['data_type'] != 'all-cause mortality data':
            d['value'] = d['noisy_data']

    generate_figure(dm, n, cv)

    return dm