예제 #1
0
def validate_ai_re(N=500, delta_true=.15, sigma_true=[.1,.1,.1,.1,.1], pi_true=quadratic, smoothness='Moderately', heterogeneity='Slightly'):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)


    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true*age_weights)
    sum_wt = pl.cumsum(age_weights*1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] - sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)


    from validate_covariates import alpha_true_sim
    area_list = pl.array(['all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT', 'IRN', 'IRQ', 'JOR', 'SYR'])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]
    
    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'north_africa_middle_east', 'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred']=[n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
예제 #2
0
def validate_age_integrating_model_sim(N=500,
                                       delta_true=.15,
                                       pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    #model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    #model.parameters['p']['smoothness'] = dict(amount='Very')

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] -
                                                       sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=25,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=model.vars['p']['mu_age'].stats()['mean'],
             sigma_pred=model.vars['p']['mu_age'].stats()
             ['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    print model.results

    return model
예제 #3
0
def validate_covariate_model_fe(N=100,
                                delta_true=3,
                                pi_true=.01,
                                beta_true=[.5, -.5, 0.],
                                replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)

    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    X = mc.rnormal(0., 1.**-2, size=(N, len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i in range(len(beta_true)):
        model.input_data['x_%d' % i] = X[:, i]
    model.input_data['true'] = pi_true * pl.exp(Y_true)

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta_true) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=5,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    add_quality_metrics(model.input_data)

    model.beta = pandas.DataFrame(index=model.vars['p']['X'].columns)
    model.beta['true'] = 0.
    for i in range(len(beta_true)):
        model.beta['true']['x_%d' % i] = beta_true[i]

    model.beta['mu_pred'] = [
        n.stats()['mean'] for n in model.vars['p']['beta']
    ]
    model.beta['sigma_pred'] = [
        n.stats()['standard deviation'] for n in model.vars['p']['beta']
    ]
    add_quality_metrics(model.beta)

    print '\nbeta'
    print model.beta

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    add_to_results(model, 'delta')

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.beta['abs_err'].dropna())),
        model.beta.dropna()['covered?'].mean())
    add_to_results(model, 'input_data')
    add_to_results(model, 'beta')

    model.results = pandas.DataFrame(model.results)
    return model
예제 #4
0
def validate_age_pattern_model_sim(N=500, delta_true=.15, pi_true=quadratic):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    model = data_simulation.simple_model(N)
    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)

    age_list = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    p = pi_age_true[age_list]
    n = mc.runiform(100, 10000, size=N)

    model.input_data['age_start'] = age_list
    model.input_data['age_end'] = age_list
    model.input_data['effective_sample_size'] = n
    model.input_data['true'] = p
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=25, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(a, pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(dict(true=pi_age_true,
                                     mu_pred=model.vars['p']['mu_age'].stats()['mean'],
                                     sigma_pred=model.vars['p']['mu_age'].stats()['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    print model.results

    return model
예제 #5
0
def validate_covariate_model_re(N=500,
                                delta_true=.15,
                                pi_true=.01,
                                sigma_true=[.1, .1, .1, .1, .1],
                                ess=1000):
    ## set simulation parameters
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p'][
        'heterogeneity'] = 'Slightly'  # ensure heterogeneity is slightly

    area_list = []
    for sr in sorted(model.hierarchy.successors('all')):
        area_list.append(sr)
        for r in sorted(model.hierarchy.successors(sr)):
            area_list.append(r)
            area_list += sorted(model.hierarchy.successors(r))[:5]
    area_list = pl.array(area_list)

    ## generate simulation data
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    alpha = alpha_true_sim(model, area_list, sigma_true)

    # choose observed prevalence values
    model.input_data['effective_sample_size'] = ess

    model.input_data['area'] = area_list[mc.rcategorical(
        pl.ones(len(area_list)) / float(len(area_list)), N)]

    model.input_data['true'] = pl.nan
    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = pi_true * pl.exp(
            pl.sum([
                alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a)
                if n in alpha
            ]))

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=20000,
                                                     burn=10000,
                                                     thin=10,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    add_quality_metrics(model.input_data)

    model.alpha = pandas.DataFrame(
        index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series(
        [n.stats()['mean'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series(
        [n.stats()['standard deviation'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    add_quality_metrics(model.alpha)

    print '\nalpha'
    print model.alpha.dropna()

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [
        n.stats()['mean'] for n in model.vars['p']['sigma_alpha']
    ]
    model.sigma['sigma_pred'] = [
        n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']
    ]
    add_quality_metrics(model.sigma)

    print 'sigma_alpha'
    print model.sigma

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'sigma')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    add_to_results(model, 'delta')

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.alpha['abs_err'].dropna())),
        model.alpha.dropna()['covered?'].mean())
    add_to_results(model, 'input_data')
    add_to_results(model, 'alpha')

    model.results = pandas.DataFrame(model.results)
    return model
예제 #6
0
def validate_covariate_model_dispersion(N=1000,
                                        delta_true=.15,
                                        pi_true=.01,
                                        zeta_true=[.5, -.5, 0.]):
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d' % i] = Z[:, i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n * p, delta * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total',
                                            'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=5,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    add_quality_metrics(model.input_data)

    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true

    model.zeta['mu_pred'] = model.vars['p']['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = model.vars['p']['zeta'].stats(
    )['standard deviation']
    add_quality_metrics(model.zeta)

    print '\nzeta'
    print model.zeta

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (
        pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
        model.zeta.dropna()['covered?'].mean())

    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results,
                                     columns='param bias mae mare pc'.split())

    return model
예제 #7
0
def fit_world(id, fast_fit=False, zero_re=True, alt_prior=False, global_heterogeneity='Slightly'):
    """ Fit consistent for all data in world

    Parameters
    ----------
    id : int
      The model id number for the job to fit

    Example
    -------
    >>> import fit_world
    >>> dm = fit_world.dismod3.load_disease_model(1234)
    >>> fit_world.fit_world(dm)
    """

    dir = dismod3.settings.JOB_WORKING_DIR % id

    ## load the model from disk or from web
    import simplejson as json
    import data
    reload(data)

    try:
        model = data.ModelData.load(dir)
        print 'loaded data from new format from %s' % dir
        dm = dismod3.load_disease_model(id)
    except (IOError, AssertionError):
        dm = dismod3.load_disease_model(id)
        model = data.ModelData.from_gbd_jsons(json.loads(dm.to_json()))
        try:
            model.save(dir)
            print 'loaded data from json, saved in new format for next time in %s' % dir
        except IOError:
            print 'loaded data from json, failed to save in new format'


    ## next block fills in missing covariates with zero
    for col in model.input_data.columns:
        if col.startswith('x_'):
            model.input_data[col] = model.input_data[col].fillna(0.)
    # also fill all covariates missing in output template with zeros
    model.output_template = model.output_template.fillna(0)

    # set all heterogeneity priors to Slightly for the global fit
    for t in model.parameters:
        if 'heterogeneity' in model.parameters[t]:
            model.parameters[t]['heterogeneity'] = global_heterogeneity

    ### For testing:
    ## speed up computation by reducing number of knots
    ## for t in 'irf':
    ##     model.parameters[t]['parameter_age_mesh'] = [0, 100]
    model.vars += dismod3.ism.consistent(model,
                                         reference_area='all',
                                         reference_sex='total',
                                         reference_year='all',
                                         priors={},
                                         zero_re=zero_re)

    ## fit model to data
    if fast_fit:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, 105, 0, 1, 100)
    else:
        dm.map, dm.mcmc = dismod3.fit.fit_consistent(model, iter=50000, burn=10000, thin=40, tune_interval=1000, verbose=True)

    dm.model = model

    # borrow strength to inform sigma_alpha between rate types post-hoc
    types_with_re = ['rr', 'f', 'i', 'm', 'smr', 'p', 'r', 'pf', 'm_with', 'X']
    ## first calculate sigma_alpha_bar from posterior draws from each alpha
    alpha_vals = []
    for type in types_with_re:
        if 'alpha' in model.vars[type]:
            for alpha_i in model.vars[type]['alpha']:
                alpha_vals += [a for a in alpha_i.trace() if a != 0]  # remove zeros because areas with no siblings are included for convenience but are pinned to zero
    ## then blend sigma_alpha_i and sigma_alpha_bar for each sigma_alpha_i
    if len(alpha_vals) > 0:
        sigma_alpha_bar = pl.std(alpha_vals)
        for type in types_with_re:
            if 'sigma_alpha' in model.vars[type]:
                for sigma_alpha_i in model.vars[type]['sigma_alpha']:
                    cur_val = sigma_alpha_i.trace()
                    sigma_alpha_i.trace._trace[0] = (cur_val + sigma_alpha_bar) * pl.ones_like(sigma_alpha_i.trace._trace[0])


    for t in 'p i r f rr pf m_with'.split():
        param_type = dict(i='incidence', r='remission', f='excess-mortality', p='prevalence', rr='relative-risk', pf='prevalence_x_excess-mortality', m_with='mortality')[t]
        #graphics.plot_one_type(model, model.vars[t], {}, t)
        for a in [dismod3.utils.clean(a) for a in dismod3.settings.gbd_regions]:
            print 'generating empirical prior for %s' % a
            for s in dismod3.settings.gbd_sexes:
                for y in dismod3.settings.gbd_years:
                    key = dismod3.utils.gbd_key_for(param_type, a, y, s)
                    if t in model.parameters and 'level_bounds' in model.parameters[t]:
                        lower=model.parameters[t]['level_bounds']['lower']
                        upper=model.parameters[t]['level_bounds']['upper']
                    else:
                        lower=0
                        upper=pl.inf
                        
                    emp_priors = covariate_model.predict_for(model,
                                                             model.parameters.get(t, {}),
                                                             'all', 'total', 'all',
                                                             a, dismod3.utils.clean(s), int(y),
                                                             alt_prior,
                                                             model.vars[t], lower, upper)
                    dm.set_mcmc('emp_prior_mean', key, emp_priors.mean(0))
                    if 'eta' in model.vars[t]:
                        N,A = emp_priors.shape  # N samples, for A age groups
                        delta_trace = pl.transpose([pl.exp(model.vars[t]['eta'].trace()) for _ in range(A)])  # shape delta matrix to match prediction matrix
                        emp_prior_std = pl.sqrt(emp_priors.var(0) + (emp_priors**2 / delta_trace).mean(0))
                    else:
                        emp_prior_std = emp_priors.std(0)
                    dm.set_mcmc('emp_prior_std', key, emp_prior_std)


        from fit_emp_prior import store_effect_coefficients
        store_effect_coefficients(dm, model.vars[t], param_type)

    
        if 'p_pred' in model.vars[t]:
            graphics.plot_one_ppc(model, t)
            pl.savefig(dir + '/prior-%s-ppc.png'%param_type)

        if 'p_pred' in model.vars[t] or 'lb' in model.vars[t]:
            graphics.plot_one_effects(model, t)
            pl.savefig(dir + '/prior-%s-effects.png'%param_type)


    for t in 'i r f p rr pf X m_with smr'.split():
        fname = dir + '/empirical_priors/data-%s.csv'%t
        print 'saving tables for', t, 'to', fname
        if 'data' in model.vars[t] and 'p_pred' in model.vars[t]:
            stats = model.vars[t]['p_pred'].stats(batches=5)
            model.vars[t]['data']['mu_pred'] = stats['mean']
            model.vars[t]['data']['sigma_pred'] = stats['standard deviation']

            stats = model.vars[t]['pi'].stats(batches=5)
            model.vars[t]['data']['mc_error'] = stats['mc error']

            model.vars[t]['data']['residual'] = model.vars[t]['data']['value'] - model.vars[t]['data']['mu_pred']
            model.vars[t]['data']['abs_residual'] = pl.absolute(model.vars[t]['data']['residual'])
            #if 'delta' in model.vars[t]:
            #    model.vars[t]['data']['logp'] = [mc.negative_binomial_like(n*p_obs, n*p_pred, n*p_pred*d) for n, p_obs, p_pred, d \
            #                                  in zip(model.vars[t]['data']['effective_sample_size'],
            #                                         model.vars[t]['data']['value'],
            #                                         model.vars[t]['data']['mu_pred'],
            #                                         pl.atleast_1d(model.vars[t]['delta'].stats()['mean']))]
            model.vars[t]['data'].to_csv(fname)


    graphics.plot_fit(model)
    pl.savefig(dir + '/prior.png')

    graphics.plot_acorr(model)
    pl.savefig(dir + '/prior-convergence.png')

    graphics.plot_trace(model)
    pl.savefig(dir + '/prior-trace.png')
    
    # save results (do this last, because it removes things from the disease model that plotting function, etc, might need
    try:
        dm.save('dm-%d-prior-%s.json' % (dm.id, 'all'))
    except IOError, e:
        print e
def validate_ai_re(N=500,
                   delta_true=.15,
                   sigma_true=[.1, .1, .1, .1, .1],
                   pi_true=quadratic,
                   smoothness='Moderately',
                   heterogeneity='Slightly'):
    ## generate simulated data
    a = pl.arange(0, 101, 1)
    pi_age_true = pi_true(a)

    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(
        json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    gbd_hierarchy = model.hierarchy

    model = data_simulation.simple_model(N)
    model.hierarchy = gbd_hierarchy

    model.parameters['p']['parameter_age_mesh'] = range(0, 101, 10)
    model.parameters['p']['smoothness'] = dict(amount=smoothness)
    model.parameters['p']['heterogeneity'] = heterogeneity

    age_start = pl.array(mc.runiform(0, 100, size=N), dtype=int)
    age_end = pl.array(mc.runiform(age_start, 100, size=N), dtype=int)

    age_weights = pl.ones_like(a)
    sum_pi_wt = pl.cumsum(pi_age_true * age_weights)
    sum_wt = pl.cumsum(age_weights * 1.)
    p = (sum_pi_wt[age_end] - sum_pi_wt[age_start]) / (sum_wt[age_end] -
                                                       sum_wt[age_start])

    # correct cases where age_start == age_end
    i = age_start == age_end
    if pl.any(i):
        p[i] = pi_age_true[age_start[i]]

    model.input_data['age_start'] = age_start
    model.input_data['age_end'] = age_end
    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, size=N)

    from validate_covariates import alpha_true_sim
    area_list = pl.array([
        'all', 'super-region_3', 'north_africa_middle_east', 'EGY', 'KWT',
        'IRN', 'IRQ', 'JOR', 'SYR'
    ])
    alpha = alpha_true_sim(model, area_list, sigma_true)
    print alpha

    model.input_data['true'] = pl.nan

    model.input_data['area'] = area_list[mc.rcategorical(
        pl.ones(len(area_list)) / float(len(area_list)), N)]

    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = p[i] * pl.exp(
            pl.sum([
                alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a)
                if n in alpha
            ]))
    p = model.input_data['true']

    n = model.input_data['effective_sample_size']
    model.input_data['value'] = mc.rnegative_binomial(n * p,
                                                      delta_true * n * p) / n

    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p',
                                            'north_africa_middle_east',
                                            'total', 'all', None, None, None)
    #model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=1005, burn=500, thin=5, tune_interval=100)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'],
                                                     iter=10000,
                                                     burn=5000,
                                                     thin=25,
                                                     tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)
    graphics.plot_one_type(model, model.vars['p'], {}, 'p')
    pl.plot(range(101), pi_age_true, 'r:', label='Truth')
    pl.legend(fancybox=True, shadow=True, loc='upper left')

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats(
    )['standard deviation']
    data_simulation.add_quality_metrics(model.input_data)

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    data_simulation.add_quality_metrics(model.delta)

    model.alpha = pandas.DataFrame(
        index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series(
        [n.stats()['mean'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series(
        [n.stats()['standard deviation'] for n in model.vars['p']['alpha']],
        index=model.vars['p']['U'].columns)
    model.alpha = model.alpha.dropna()
    data_simulation.add_quality_metrics(model.alpha)

    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [
        n.stats()['mean'] for n in model.vars['p']['sigma_alpha']
    ]
    model.sigma['sigma_pred'] = [
        n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']
    ]
    data_simulation.add_quality_metrics(model.sigma)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (
        model.input_data['abs_err'].mean(),
        pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
        model.input_data['covered?'].mean())

    model.mu = pandas.DataFrame(
        dict(true=pi_age_true,
             mu_pred=model.vars['p']['mu_age'].stats()['mean'],
             sigma_pred=model.vars['p']['mu_age'].stats()
             ['standard deviation']))
    data_simulation.add_quality_metrics(model.mu)

    data_simulation.initialize_results(model)
    data_simulation.add_to_results(model, 'delta')
    data_simulation.add_to_results(model, 'mu')
    data_simulation.add_to_results(model, 'input_data')
    data_simulation.add_to_results(model, 'alpha')
    data_simulation.add_to_results(model, 'sigma')
    data_simulation.finalize_results(model)

    print model.results

    return model
예제 #9
0
def validate_covariate_model_fe(N=100, delta_true=3, pi_true=.01, beta_true=[.5, -.5, 0.], replicate=0):
    # set random seed for reproducibility
    mc.np.random.seed(1234567 + replicate)
    
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    # add fixed effect to simulated data
    X = mc.rnormal(0., 1.**-2, size=(N,len(beta_true)))
    Y_true = pl.dot(X, beta_true)

    for i in range(len(beta_true)):
        model.input_data['x_%d'%i] = X[:,i]
    model.input_data['true'] = pi_true * pl.exp(Y_true)

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true) / n


    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)


    model.beta = pandas.DataFrame(index=model.vars['p']['X'].columns)
    model.beta['true'] = 0.
    for i in range(len(beta_true)):
        model.beta['true']['x_%d'%i] = beta_true[i]
    
    model.beta['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['beta']]
    model.beta['sigma_pred'] = [n.stats()['standard deviation'] for n in model.vars['p']['beta']]
    add_quality_metrics(model.beta)

    print '\nbeta'
    print model.beta
    
    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'beta')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    add_to_results(model, 'delta')

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.beta['abs_err'].dropna())),
                                                           model.beta.dropna()['covered?'].mean())
    add_to_results(model, 'input_data')
    add_to_results(model, 'beta')

    model.results = pandas.DataFrame(model.results)
    return model
예제 #10
0
def validate_covariate_model_dispersion(N=1000, delta_true=.15, pi_true=.01, zeta_true=[.5, -.5, 0.]):
    ## generate simulated data
    a = pl.arange(0, 100, 1)
    pi_age_true = pi_true * pl.ones_like(a)

    model = data.ModelData()
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    Z = mc.rbernoulli(.5, size=(N, len(zeta_true))) * 1.0
    delta = delta_true * pl.exp(pl.dot(Z, zeta_true))
    for i in range(len(zeta_true)):
        model.input_data['z_%d'%i] = Z[:,i]

    model.input_data['true'] = pi_true

    model.input_data['effective_sample_size'] = mc.runiform(100, 10000, N)

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta*n*p) / n


    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=10000, burn=5000, thin=5, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()


    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)


    model.zeta = pandas.DataFrame(index=model.vars['p']['Z'].columns)
    model.zeta['true'] = zeta_true
    
    model.zeta['mu_pred'] = model.vars['p']['zeta'].stats()['mean']
    model.zeta['sigma_pred'] = model.vars['p']['zeta'].stats()['standard deviation']
    add_quality_metrics(model.zeta)

    print '\nzeta'
    print model.zeta
    
    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.zeta['abs_err'].dropna())),
                                                           model.zeta.dropna()['covered?'].mean())


    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'delta')
    add_to_results(model, 'input_data')
    add_to_results(model, 'zeta')
    model.results = pandas.DataFrame(model.results, columns='param bias mae mare pc'.split())

    return model
예제 #11
0
def validate_covariate_model_re(N=500, delta_true=.15, pi_true=.01, sigma_true = [.1,.1,.1,.1,.1], ess=1000):
    ## set simulation parameters
    import dismod3
    import simplejson as json
    model = data.ModelData.from_gbd_jsons(json.loads(dismod3.disease_json.DiseaseJson().to_json()))
    model.parameters['p']['parameter_age_mesh'] = [0, 100]
    model.parameters['p']['heterogeneity'] = 'Slightly'  # ensure heterogeneity is slightly

    area_list = []
    for sr in sorted(model.hierarchy.successors('all')):
        area_list.append(sr)
        for r in sorted(model.hierarchy.successors(sr)):
            area_list.append(r)
            area_list += sorted(model.hierarchy.successors(r))[:5]
    area_list = pl.array(area_list)


    ## generate simulation data
    model.input_data = pandas.DataFrame(index=range(N))
    initialize_input_data(model.input_data)

    alpha = alpha_true_sim(model, area_list, sigma_true)

    # choose observed prevalence values
    model.input_data['effective_sample_size'] = ess

    model.input_data['area'] = area_list[mc.rcategorical(pl.ones(len(area_list)) / float(len(area_list)), N)]

    model.input_data['true'] = pl.nan
    for i, a in model.input_data['area'].iteritems():
        model.input_data['true'][i] = pi_true * pl.exp(pl.sum([alpha[n] for n in nx.shortest_path(model.hierarchy, 'all', a) if n in alpha]))

    n = model.input_data['effective_sample_size']
    p = model.input_data['true']
    model.input_data['value'] = mc.rnegative_binomial(n*p, delta_true*n*p) / n



    ## Then fit the model and compare the estimates to the truth
    model.vars = {}
    model.vars['p'] = data_model.data_model('p', model, 'p', 'all', 'total', 'all', None, None, None)
    model.map, model.mcmc = fit_model.fit_data_model(model.vars['p'], iter=20000, burn=10000, thin=10, tune_interval=100)

    graphics.plot_one_ppc(model.vars['p'], 'p')
    graphics.plot_convergence_diag(model.vars)

    pl.show()

    model.input_data['mu_pred'] = model.vars['p']['p_pred'].stats()['mean']
    model.input_data['sigma_pred'] = model.vars['p']['p_pred'].stats()['standard deviation']
    add_quality_metrics(model.input_data)


    model.alpha = pandas.DataFrame(index=[n for n in nx.traversal.dfs_preorder_nodes(model.hierarchy)])
    model.alpha['true'] = pandas.Series(dict(alpha))
    model.alpha['mu_pred'] = pandas.Series([n.stats()['mean'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    model.alpha['sigma_pred'] = pandas.Series([n.stats()['standard deviation'] for n in model.vars['p']['alpha']], index=model.vars['p']['U'].columns)
    add_quality_metrics(model.alpha)

    print '\nalpha'
    print model.alpha.dropna()


    model.sigma = pandas.DataFrame(dict(true=sigma_true))
    model.sigma['mu_pred'] = [n.stats()['mean'] for n in model.vars['p']['sigma_alpha']]
    model.sigma['sigma_pred']=[n.stats()['standard deviation'] for n in model.vars['p']['sigma_alpha']]
    add_quality_metrics(model.sigma)

    print 'sigma_alpha'
    print model.sigma

    
    model.results = dict(param=[], bias=[], mare=[], mae=[], pc=[])
    add_to_results(model, 'sigma')

    model.delta = pandas.DataFrame(dict(true=[delta_true]))
    model.delta['mu_pred'] = pl.exp(model.vars['p']['eta'].trace()).mean()
    model.delta['sigma_pred'] = pl.exp(model.vars['p']['eta'].trace()).std()
    add_quality_metrics(model.delta)

    print 'delta'
    print model.delta
    add_to_results(model, 'delta')

    print '\ndata prediction bias: %.5f, MARE: %.3f, coverage: %.2f' % (model.input_data['abs_err'].mean(),
                                                     pl.median(pl.absolute(model.input_data['rel_err'].dropna())),
                                                                       model.input_data['covered?'].mean())
    print 'effect prediction MAE: %.3f, coverage: %.2f' % (pl.median(pl.absolute(model.alpha['abs_err'].dropna())),
                                                          model.alpha.dropna()['covered?'].mean())
    add_to_results(model, 'input_data')
    add_to_results(model, 'alpha')

    model.results = pandas.DataFrame(model.results)
    return model