Example #1
def validate_once(true_cf=[pl.ones(3) / 3.0,
                           pl.ones(3) / 3.0],
                  true_std=0.01 * pl.ones(3),
                  std_bias=[1., 1., 1.],
                  save=False,
                  dir='',
                  i=0):
    """
    Generate a set of simulated estimates for the provided true cause fractions; Fit the bad model and 
    the latent simplex model to this simulated data and calculate quality metrics. 
    """

    # generate simulation data
    X = data.sim_data_for_validation(1000, true_cf, true_std, std_bias)

    # fit bad model, calculate fit metrics
    bad_model = models.bad_model(X)
    bad_model_metrics = calc_quality_metrics(true_cf, true_std, std_bias,
                                             bad_model)
    retrieve_estimates(bad_model, True, 'bad_model', dir, i)

    # fit latent simplex model, calculate fit metrics
    m, latent_simplex = models.fit_latent_simplex(X)
    latent_simplex_metrics = calc_quality_metrics(true_cf, true_std, std_bias,
                                                  latent_simplex)
    retrieve_estimates(latent_simplex, True, 'latent_simplex', dir, i)

    # either write results to disk or return them
    if save:
        pl.rec2csv(bad_model_metrics, '%s/metrics_bad_model_%i.csv' % (dir, i))
        pl.rec2csv(latent_simplex_metrics,
                   '%s/metrics_latent_simplex_%i.csv' % (dir, i))
    else:
        return bad_model_metrics, latent_simplex_metrics
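
These examples lean on the legacy record-array helpers that pylab re-exports from matplotlib.mlab. A minimal round-trip sketch, assuming matplotlib < 3.1 (rec2csv/csv2rec were removed in 3.1); the file name is illustrative:

import numpy as np
import pylab as pl

# Build a small recarray and round-trip it through CSV, as validate_once
# does with its metrics tables.
metrics = np.core.records.fromarrays(
    [np.arange(3), np.array([0.1, 0.2, 0.3])],
    names=['cause', 'abs_err'])
pl.rec2csv(metrics, 'metrics_demo.csv')      # write the recarray as CSV
roundtrip = pl.csv2rec('metrics_demo.csv')   # read it back as a recarray
print(roundtrip.abs_err)
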
Example #2
def combine_output(J, T, model, dir, reps, save=False):
    """
    Combine output on absolute error, relative error, csmf_accuracy, and coverage from from
    multiple runs of validate_once. Either saves the output to the disk, or returns arays
    for each. 
    """

    cause = pl.zeros(J*T, dtype='f').view(pl.recarray)
    time = pl.zeros(J*T, dtype='f').view(pl.recarray)
    abs_err = pl.zeros(J*T, dtype='f').view(pl.recarray) 
    rel_err = pl.zeros(J*T, dtype='f').view(pl.recarray)
    coverage = pl.zeros(J*T, dtype='f').view(pl.recarray)
    csmf_accuracy = pl.zeros(J*T, dtype='f').view(pl.recarray)

    for i in range(reps): 
        metrics = pl.csv2rec('%s/metrics_%s_%i.csv' % (dir, model, i))
        cause = pl.vstack((cause, metrics.cause))
        time = pl.vstack((time, metrics.time))
        abs_err = pl.vstack((abs_err, metrics.abs_err))
        rel_err = pl.vstack((rel_err, metrics.rel_err))
        coverage = pl.vstack((coverage, metrics.coverage))
        csmf_accuracy = pl.vstack((csmf_accuracy, metrics.csmf_accuracy))

    cause = cause[1:,]
    time = time[1:,]    
    abs_err = abs_err[1:,]
    rel_err = rel_err[1:,]
    coverage = coverage[1:,]
    csmf_accuracy = csmf_accuracy[1:,]

    mean_abs_err = abs_err.mean(0)
    median_abs_err =  pl.median(abs_err, 0)
    mean_rel_err = rel_err.mean(0)
    median_rel_err = pl.median(rel_err, 0)
    mean_csmf_accuracy = csmf_accuracy.mean(0)
    median_csmf_accuracy = pl.median(csmf_accuracy, 0)
    mean_coverage_bycause = coverage.mean(0)
    mean_coverage = coverage.reshape(reps, T, J).mean(0).mean(1)
    percent_total_coverage = (coverage.reshape(reps, T, J).sum(2)==3).mean(0)
    mean_coverage = pl.array([[i for j in range(J)] for i in mean_coverage]).ravel()
    percent_total_coverage = pl.array([[i for j in range(J)] for i in percent_total_coverage]).ravel()

    models = pl.array([[model for j in range(J)] for i in range(T)]).ravel()
    true_cf = metrics.true_cf
    true_std = metrics.true_std
    std_bias = metrics.std_bias

    all = pl.np.core.records.fromarrays([models, cause[0], time[0], true_cf, true_std, std_bias, mean_abs_err, median_abs_err, mean_rel_err, median_rel_err, 
                                         mean_csmf_accuracy, median_csmf_accuracy, mean_coverage_bycause, mean_coverage, percent_total_coverage], 
                                        names=['model', 'cause', 'time', 'true_cf', 'true_std', 'std_bias', 'mean_abs_err', 'median_abs_err', 
                                         'mean_rel_err', 'median_rel_err', 'mean_csmf_accuracy', 'median_csmf_accuracy', 
                                         'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'])
    
    if save: 
        pl.rec2csv(all, '%s/%s_summary.csv' % (dir, model)) 
    else: 
        return all
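
The loop above seeds each array with a row of zeros and strips it off after stacking. A minimal sketch of the same accumulation, with stand-in replicate arrays, that collects rows in a list and stacks once instead:

import numpy as np

# Stand-ins for the per-replicate metric columns read back from CSV.
reps = [np.random.rand(4) for _ in range(3)]
abs_err = np.vstack(reps)           # shape (reps, J*T), no dummy zero row
print(abs_err.mean(0))              # column-wise mean across replicates
print(np.median(abs_err, 0))        # column-wise median across replicates
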
Example #3
def sites_and_env(session, species, layer_names, glob_name, glob_channels, buffer_width, n_pseudoabsences, dblock=None, simdata=False):
    """
    Queries the DB to get a list of locations. Writes it out along with matching 
    extractions of the requested layers to a temporary csv file, which serves the 
    dual purpose of caching the extraction and making it easier to get data into 
    the BRT package.
    """

    breaks, x, found, zero, others_found, multipoints, eo = sites_as_ndarray(session, species)
    
    if simdata:
        print 'Process %i simulating presences for species %s.'%(multiprocessing.current_process().ident,species[1])
        x = get_pseudoabsences(eo, -1, n_pseudoabsences, layer_names, glob_name)
        found = np.ones(n_pseudoabsences)
        

    fname = hashlib.sha1(str(x)+found.tostring()+\
            glob_name+'channel'.join([str(i) for i in glob_channels])+\
            'layer'.join(layer_names)).hexdigest()+'.csv'
            
    pseudoabsences = get_pseudoabsences(eo, buffer_width, n_pseudoabsences, layer_names, glob_name)        
            
    x_found = x[np.where(found)]

    x = np.vstack((x_found, pseudoabsences))
    found = np.concatenate((np.ones(len(x_found)), np.zeros(n_pseudoabsences)))

    if fname in os.listdir('anopheles-caches'):
        pass
    else:

        # Makes list of (key, value) tuples
        env_layers = map(lambda ln: extract_environment(ln, x, lock=dblock), layer_names)\
                + map(lambda ch: (os.path.basename(glob_name)+'_'+str(ch), extract_environment(glob_name,x,\
                    postproc=lambda d: d==ch, id_=ch, lock=dblock)[1]), glob_channels)

        arrays = [(found>0).astype('int')] + [l[1] for l in env_layers]
        names = ['found'] + [l[0] for l in env_layers]

        data = np.rec.fromarrays(arrays, names=','.join(names))
        nancheck = np.array([np.any(np.isnan(row.tolist())) for row in data])
        if np.any(nancheck):
            print 'There were some NaNs in the data, probably points in the sea'

        singletons = 0
        for e in env_layers:
            if len(set(e[1][np.where(True-np.isnan(e[1]))]))==1:
                singletons += 1
        if singletons == len(env_layers):
            raise ValueError, 'All environmental layer evaluations contained only single values.'
        
        data = data[np.where(True-nancheck)]
        rec2csv(data, os.path.join('anopheles-caches',fname))

    return fname, pseudoabsences, x
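
The cache filename is a content hash of everything that determines the extraction, so a repeat call with the same inputs reuses the cached CSV. A stripped-down sketch of that idea; the layer names are illustrative, and the anopheles-caches directory is assumed to exist as in the example:

import hashlib
import os

import numpy as np

# Hash the inputs that determine the extraction result.
x = np.zeros((2, 2))
layer_names = ['rain', 'temp']   # illustrative layer names
key = str(x) + 'layer'.join(layer_names)
fname = hashlib.sha1(key.encode()).hexdigest() + '.csv'
if fname not in os.listdir('anopheles-caches'):
    pass  # extract the layers and write the CSV here
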
Example #4
def knockout_uniformly_at_random(in_fname='noisy_data.csv', out_fname='missing_noisy_data.csv', pct=20.):
    """ replace data.csv y column with uniformly random missing entries

    Parameters
    ----------
    pct : float, percent to knockout
    """
    data = pl.csv2rec(in_fname)
    for i, row in enumerate(data):
        if pl.rand() < pct/100.:
            data[i].y = pl.nan
    pl.rec2csv(data, out_fname)
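
A quick usage sketch, assuming knockout_uniformly_at_random from the example above is in scope and the legacy pylab CSV helpers are available:

import numpy as np
import pylab as pl

# Write a tiny input file with a y column, then knock out ~20% of it.
demo = np.core.records.fromarrays(
    [np.arange(10.), np.random.randn(10)], names=['x', 'y'])
pl.rec2csv(demo, 'noisy_data.csv')
knockout_uniformly_at_random('noisy_data.csv', 'missing_noisy_data.csv',
                             pct=20.)
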
Example #5
def compile_all_results(scenarios, dir='../data'):
    """
    Compiles the results across multiple scenarios produced by running run_on_cluster on each 
    one into a single sv file. The specified directory must be where where the results of 
    running run_on_cluster for each scenario are stored (each is a sub-directory named v0, v1, etc.)
    and is also where the output from this function will be saved.    
    """

    models = []
    causes = []
    time = []
    true_cf = []
    true_std = []
    std_bias = []
    mean_abs_err = []
    median_abs_err = []
    mean_rel_err = []
    median_rel_err = []
    mean_csmf_accuracy = []
    median_csmf_accuracy = []
    mean_coverage_bycause = []
    mean_coverage = []
    percent_total_coverage = []
    scenario = []

    for i in range(scenarios):
        for j in ['bad_model', 'latent_simplex']: 
            read = csv.reader(open('%s/v%s/%s_summary.csv' % (dir, i, j)))
            read.next()
            for row in read: 
                models.append(row[0])
                causes.append(row[1])
                time.append(row[2])
                true_cf.append(row[3])
                true_std.append(row[4])
                std_bias.append(row[5])
                mean_abs_err.append(row[6])
                median_abs_err.append(row[7])
                mean_rel_err.append(row[8])
                median_rel_err.append(row[9])
                mean_csmf_accuracy.append(row[10])
                median_csmf_accuracy.append(row[11])
                mean_coverage_bycause.append(row[12])
                mean_coverage.append(row[13])
                percent_total_coverage.append(row[14])
                scenario.append(i)

    all = pl.np.core.records.fromarrays([scenario, models, time, true_cf, true_std, causes, mean_abs_err, median_abs_err, mean_rel_err, median_rel_err, 
                                         mean_csmf_accuracy, median_csmf_accuracy, mean_coverage_bycause, mean_coverage, percent_total_coverage], 
                                        names=['scenario', 'model', 'time', 'true_cf', 'true_std', 'cause', 'mean_abs_err', 'median_abs_err', 
                                         'mean_rel_err', 'median_rel_err', 'mean_csmf_accuracy', 'median_csmf_accuracy', 
                                         'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'])
    pl.rec2csv(all, fname='%s/all_summary_metrics.csv' % (dir))  
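
The summary recarray is assembled from parallel Python lists; np.core.records.fromarrays infers a dtype per column. A minimal sketch with illustrative values:

import numpy as np

scenario = [0, 0, 1, 1]
models = ['bad_model', 'latent_simplex'] * 2
summary = np.core.records.fromarrays([scenario, models],
                                     names=['scenario', 'model'])
print(summary.model)   # dtype inferred as a string type for this column
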
Example #6
def add_sampling_error(in_fname='data.csv', out_fname='noisy_data.csv', std=1.):
    """ add normally distributed noise to data.csv y column

    Parameters
    ----------
    std : float, or array of floats
      standard deviation of noise
    """
    data = pl.csv2rec(in_fname)
    if type(std) == float:
        std = std * pl.ones(len(data))
    for i, row in enumerate(data):
        data[i].y += std[i] * pl.randn(1)
        data[i].se += std[i]
    pl.rec2csv(data, out_fname)
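
A usage sketch, assuming add_sampling_error from the example above is in scope; the input file needs y and se columns:

import numpy as np
import pylab as pl

# Write a tiny data.csv, then add noise with standard deviation 0.5.
demo = np.core.records.fromarrays(
    [np.random.randn(10), np.zeros(10)], names=['y', 'se'])
pl.rec2csv(demo, 'data.csv')
add_sampling_error('data.csv', 'noisy_data.csv', std=0.5)
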
Example #7
def retrieve_estimates(preds, save=False, model='', dir='', i=0):
    """
    calculates the posterior mean for pi as well as the 95% hpd region and
    optionally saves this output
    """
    
    T, J = preds.shape[1:]
    mean = preds.mean(0).ravel()
    hpd = mc.utils.hpd(preds, 0.05)
    lower = hpd[:,:,0].ravel()
    upper = hpd[:,:,1].ravel()
    time =  pl.array([[t for j in range(J)] for t in range(T)]).ravel()
    cause = pl.array([[j for j in range(J)] for t in range(T)]).ravel()
    results = pl.np.core.records.fromarrays([time, cause, mean, lower, upper], 
                                            names=['time', 'cause', 'med', 'lower', 'upper'])
    
    if (save): 
        pl.rec2csv(results, '%s/%s_estimates%i.csv' % (dir, model, i))
    else: 
        return(results)
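
The interval columns come from PyMC's highest posterior density utility. A shape sketch, assuming PyMC 2.x (mc.utils.hpd) and a stand-in draws array; note the 'med' field above actually stores the posterior mean:

import numpy as np
import pymc as mc

preds = np.random.randn(500, 4, 3)   # (samples, T, J) stand-in for draws
hpd = mc.utils.hpd(preds, 0.05)      # 95% HPD bounds per (t, j) cell
print(preds.mean(0).shape)           # (4, 3)
print(hpd.shape)                     # (4, 3, 2): lower and upper bounds
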
Example #8
def trees_to_diagnostics(brt_evaluator, fname, species_name, n_pseudopresences, n_pseudoabsences, config_filename):
    """
    Takes the BRT evaluator and sees how well it does at predicting the training dataset.
    """

    from diagnostics import simple_assessments, roc, plot_roc_

    din = csv2rec(os.path.join('anopheles-caches',fname))
    found = din.found
    din = dict([(k,din[k]) for k in brt_evaluator.nice_tree_dict.iterkeys()])
    probs = pm.flib.invlogit(brt_evaluator(din))

    print 'Species %s: fraction %f correctly classified.'%(species_name, ((probs>.5)*found+(probs<.5)*(True-found)).sum()/float(len(probs)))

    result_dirname = get_result_dir(config_filename)
    
    resdict = {}
    for f in simple_assessments:
        resdict[f.__name__] = f(probs>.5, found)

    pstack = np.array([pm.rbernoulli(probs) for i in xrange(10000)])
    fp, tp, AUC = roc(pstack, found)
    resdict['AUC'] = AUC
    
    fout=file(os.path.join(result_dirname,'simple-diagnostics.txt'),'w')
    fout.write('presences: %i\n'%(found.sum()-n_pseudopresences))
    fout.write('pseudopresences: %i\n'%n_pseudopresences)
    fout.write('pseudoabsences: %i\n'%n_pseudoabsences)
    for k in resdict.iteritems():
        fout.write('%s: %s\n'%k)
    
    import pylab as pl
    pl.clf()
    plot_roc_(fp,tp,AUC)
    pl.savefig(os.path.join(result_dirname,'roc.pdf'))
    
    r = np.rec.fromarrays([fp,tp],names='false,true')
    rec2csv(r,os.path.join(result_dirname,'roc.csv'))
Example #9
    def measure_fit(self):
        ''' Provide metrics of fit to determine how well the model performed '''
        # TODO: code up RMSE for non-holdout predictions
        if self.training_type == 'make predictions':
            print 'RMSE for non-holdout data not yet implemented'

        # calculate age-adjusted rates on the test data
        else:
            predicted = self.predictions[['country','year','age','pop','actual_deaths', 'mean_deaths', 'upper_deaths', 'lower_deaths']].view(np.recarray)
            predicted = recfunctions.append_fields(predicted, 'mean_rate', predicted.mean_deaths / predicted.pop * 100000.).view(np.recarray)
            predicted = recfunctions.append_fields(predicted, 'actual_rate', predicted.actual_deaths / predicted.pop * 100000.).view(np.recarray)
            predicted = recfunctions.append_fields(predicted, 'weight', np.ones(predicted.shape[0])).view(np.recarray)
            for a in self.age_list:
                predicted.weight[np.where(predicted.age==a)[0]] = self.age_weights.weight[np.where(self.age_weights.age==a)[0]]
            predicted.mean_rate = predicted.mean_rate * predicted.weight
            predicted.actual_rate = predicted.actual_rate * predicted.weight
            from matplotlib import mlab
            adj_rates = mlab.rec_groupby(predicted, ('country','year'), (('mean_rate', np.sum, 'adj_mean_rate'),('actual_rate', np.sum, 'adj_actual_rate')))

            # calculate RMSE/RMdSE
            err = adj_rates.adj_mean_rate - adj_rates.adj_actual_rate
            sq_err = err ** 2.
            mse = np.mean(sq_err)
            mdse = np.median(sq_err)
            rmse = np.sqrt(mse)
            rmdse = np.sqrt(mdse)

            # calculate AARE/MdARE
            abs_rel_err = np.abs(err / adj_rates.adj_actual_rate)
            aare = np.mean(abs_rel_err)
            mdare = np.median(abs_rel_err)

            # calculate coverage (age-specific, not age-adjusted)
            coverage = np.array((predicted.upper_deaths >= predicted.actual_deaths) & (predicted.lower_deaths <= predicted.actual_deaths)).astype(np.int).mean()

            # output fit metrics
            print 'Root Mean Square Error: ' + str(rmse), \
                '\nRoot Median Square Error: ' + str(rmdse), \
                '\nAverage Absolute Relative Error: ' + str(aare), \
                '\nMedian Absolute Relative Error: ' + str(mdare), \
                '\nCoverage: ' + str(coverage)
            pl.rec2csv(np.core.records.fromarrays(
                [np.array(('rmse', 'rmdse', 'aare', 'mdare', 'coverage')),
                 np.array((rmse, rmdse, aare, mdare, coverage))],
                names=['metric', 'value']),
                '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name + '_fits_' + self.cause + '_' + self.sex + '.csv')
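
rec_groupby collapses the weighted age-specific rates to one row per country-year. A minimal sketch with stand-in records, assuming matplotlib's legacy mlab module:

import numpy as np
from matplotlib import mlab

rows = np.core.records.fromarrays(
    [np.array(['USA', 'USA', 'MEX', 'MEX']),
     np.array([2000, 2000, 2000, 2000]),
     np.array([1.0, 2.0, 3.0, 4.0])],
    names=['country', 'year', 'mean_rate'])
adj = mlab.rec_groupby(rows, ('country', 'year'),
                       (('mean_rate', np.sum, 'adj_mean_rate'),))
print(adj.adj_mean_rate)   # one summed rate per (country, year) group
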
Example #10
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval



# save basic estimates
model_estimates =   model.trace('estimate')[:]
mean_estimate =     model_estimates.mean(axis=0)
lower_estimate =    percentile(model_estimates, 2.5, axis=0)
upper_estimate =    percentile(model_estimates, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/spatial smoothing/SWRI_with_spatial.csv')


'''
### plot diagnostics
# setup plotting
#import matplotlib.pyplot as pp
#pp.switch_backend('acc')
plot_me = [mu_si, mu_ss, mu_ci, mu_cs, sigma_si, sigma_ss, sigma_ci, sigma_cs, state_intercepts, state_slopes, cause_intercepts, cause_slopes]

# plot traces
os.chdir(proj_dir + '/outputs/model results/simple random effects by state/mcmc plots/traces/')
for p in plot_me:
    mc.Matplot.plot(p, suffix='_trace')
    if len(p.shape) == 0:
        plt.close()
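
The save step appends posterior summaries to the input recarray before writing. A condensed sketch, assuming legacy matplotlib (pl.rec_append_fields, pl.rec2csv) and a stand-in trace array:

import numpy as np
import pylab as pl

data = np.core.records.fromarrays([np.arange(5)], names=['row'])
trace = np.random.randn(200, 5)            # (draws, observations) stand-in
output = pl.rec_append_fields(
    rec=data,
    names=['mean', 'lower', 'upper'],
    arrs=[trace.mean(axis=0),
          np.percentile(trace, 2.5, axis=0),
          np.percentile(trace, 97.5, axis=0)])
pl.rec2csv(output, 'estimates_demo.csv')
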
Example #11
        weights = np.array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i+2)
        j = i + 1
        weights = np.array([(j - index), (index - i)],float)
        wshape = [1]*sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval



# save basic estimates
model_estimates =   model.trace('estimate')[:]
mean_estimate =     model_estimates.mean(axis=0)
lower_estimate =    percentile(model_estimates, 2.5, axis=0)
upper_estimate =    percentile(model_estimates, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/spatial smoothing/' + mod_name + '_' + str(sex) + '_' + age + '.csv')

# save draws
draws =     pl.rec_append_fields(
                    rec =   data,
                    names = ['draw_' + str(i+1) for i in range(100)],
                    arrs =  [model.trace('estimate')[i] for i in range(100)])
pl.rec2csv(draws, proj_dir + 'outputs/model results/spatial smoothing/' + mod_name + '_draws_' + str(sex) + '_' + age + '.csv')
Example #12
	tfr_draws[draw,:] = Realization(M, C)(predictionyears)


# collapse across draws
# note: space transformations need to be performed at the draw level
logit_est = gpr.collapse_sims(tfr_draws)
unlogit_est = gpr.collapse_sims(np.exp(tfr_draws)*tfr_bound/(1+np.exp(tfr_draws))) # get the inverse logit



os.chdir('FILEPATH')

all_est = []
for i in range(len(predictionyears)):
	all_est.append((cc, predictionyears[i], unlogit_est['med'][i], unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'), ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
pl.rec2csv(all_est, 'gpr_%s.txt' %(cc+'_'+ str(best_amp2x) + '_' + str(best_scale)))

# save the sims 
all_sim = []
for i in range(len(predictionyears)):
	for s in range(draws):
		all_sim.append((cc, predictionyears[i], s, np.exp(tfr_draws[s][i])*tfr_bound/ (1+np.exp(tfr_draws[s][i])) ))


all_sim = pl.array(all_sim, [('ihme_loc', '|S32'), ('year', '<f8'), ('sim', '<f8'), ('fert', '<f8')])


pl.rec2csv(all_sim, 'gpr_%s_sim.txt' %(cc+ '_' + str(best_amp2x) + '_' + str(best_scale)))
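
The draws are simulated in logit space and mapped back to (0, tfr_bound) by the scaled inverse logit. A small numeric sketch with stand-in draws:

import numpy as np

tfr_bound = 10.0                        # illustrative upper bound
logit_draws = np.random.randn(4, 3)     # (draws, years) stand-in
tfr = np.exp(logit_draws) * tfr_bound / (1 + np.exp(logit_draws))
print(tfr.min() > 0 and tfr.max() < tfr_bound)   # True: values stay in bounds
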

Example #13
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test,
            spacetime_iters, top_submodel):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m + 1)].dtype == 'float64':
            all_data = np.delete(
                all_data,
                np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
                axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([
        str(all_data.iso3[i]) + '_' + str(all_data.age_group[i])
        for i in range(len(all_data))
    ])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    total_iters = np.sum(spacetime_iters)
    draws = [
        np.empty(len(country_age_list), 'float') for i in range(total_iters)
    ]
    if (top_submodel > 0):
        top_submodel_draws = [
            np.empty(len(country_age_list), 'float') for i in range(100)
        ]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf']) == 0)
                                  & (ca_data['test_' + test] == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # keep track of how many iterations have been added for this model
        iter_counter = 0

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # continue making predictions if we actually need draws for this model
            if (spacetime_iters[m] > 0) or (m + 1 == top_submodel):

                # skip models with no spacetime results
                if all_data['spacetime_' + str(m + 1)].dtype != 'float64':
                    for i in range(spacetime_iters[m]):
                        draws[iter_counter][country_age_list == ca] = np.NaN
                        iter_counter += 1
                    if (m + 1 == top_submodel):
                        for i in range(100):
                            top_submodel_draws[i][country_age_list ==
                                                  ca] = np.NaN
                    continue

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data['spacetime_' +
                                    str(m + 1)][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data['spacetime_amplitude_' +
                                            str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M,
                               C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed['spacetime_data_variance_' +
                                                 str(m + 1)],
                               obs_vals=ca_observed[dv])

                # draw realizations from the data
                realizations = [
                    gp.Realization(M, C) for i in range(spacetime_iters[m])
                ]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                age_group[country_age_list == ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(spacetime_iters[m]):
                    try:
                        draws[iter_counter][country_age_list ==
                                            ca] = realizations[i](year_list)
                    except:
                        print('Failure in ' + ca)
                    iter_counter += 1

                # if it's the top submodel, do 100 additional draws
                if (m + 1 == top_submodel):
                    realizations = [gp.Realization(M, C) for i in range(100)]
                    for i in range(100):
                        try:
                            top_submodel_draws[i][country_age_list ==
                                                  ca] = realizations[i](
                                                      year_list)
                        except:
                            print('Failure in ' + ca)

    # save the results
    print('Saving GPR results')
    names = ['iso3', 'age_group', 'year']
    results = np.core.records.fromarrays([iso3, age_group, year], names=names)
    for i in range(total_iters):
        results = recfunctions.append_fields(results,
                                             'ensemble_d' + str(i + 1),
                                             draws[i])
    if (top_submodel > 0):
        for i in range(100):
            results = recfunctions.append_fields(results,
                                                 'top_submodel_d' + str(i + 1),
                                                 top_submodel_draws[i])
    rec2csv(results, outfile)
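
A condensed sketch of the GPR core in fit_GPR, assuming PyMC 2.x's gp module: a mean function built by linear interpolation of the spacetime prior, a Matern covariance, conditioning on observed points, and a realization evaluated on the prediction mesh. All numeric values here are illustrative:

import numpy as np
from pymc import gp

year_list = np.arange(1990., 2011.)
ca_prior = np.linspace(0.2, 0.4, len(year_list))   # stand-in spacetime prior

# Mean from linear interpolation of the prior, Matern covariance, then
# condition on two illustrative observations and draw one realization.
M = gp.Mean(lambda x: np.interp(x, year_list, ca_prior))
C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2,
                  amp=0.1, scale=10.)
gp.observe(M=M, C=C, obs_mesh=np.array([1995., 2005.]),
           obs_V=np.array([1e-4, 1e-4]), obs_vals=np.array([0.25, 0.35]))
draw = gp.Realization(M, C)(year_list)             # one conditioned draw
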
Example #14
def rec2csv_2d(Y, fname):
    """
    write a 2-dimensional recarray to a csv file
    """

    pl.rec2csv(pl.np.core.records.fromarrays(Y.T), fname)
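
A usage sketch, assuming rec2csv_2d from the example above is in scope; the transpose makes each column of Y a recarray field, so the CSV layout matches Y:

import numpy as np

Y = np.arange(6.).reshape(3, 2)    # 3 draws of 2 quantities
rec2csv_2d(Y, 'draws_demo.csv')    # columns written as auto-named fields f0, f1
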
Example #15
        for i in range(len(predictionyears)):
            all_est.append((cc, ss, predictionyears[i], unlog_est['med'][i],
                            unlog_est['lower'][i], unlog_est['upper'][i]))

        all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('sex', '|S32'),
                                     ('year', '<f8'), ('mort_med', '<f8'),
                                     ('mort_lower', '<f8'),
                                     ('mort_upper', '<f8')])

        ## no need to save the summary if we're doing the HIV draws version
        if (huncert == int(1)):
            os.chdir('strPath')
        else:
            os.chdir('%s/strPath' %
                     ('/home/j' if os.name == 'posix' else 'J:'))
            pl.rec2csv(all_est, 'gpr_%s_%s_not_scaled.txt' % (cc, ss))

        # save the sims
        all_sim = []

        for i in range(len(predictionyears)):
            for s in range(dr):
                if (transform == 'log10'):
                    all_sim.append((cc, ss, predictionyears[i], s,
                                    10**d[s][i]))  # log base 10 space
                elif (transform == 'ln'):
                    all_sim.append((cc, ss, predictionyears[i], s,
                                    math.e**d[s][i]))  # natural log space
                elif (transform == 'logit'):
                    all_sim.append((cc, ss, predictionyears[i], s,
                                    ((math.e**d[s][i]) /
Example #16
        weights = np.array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i+2)
        j = i + 1
        weights = np.array([(j - index), (index - i)],float)
        wshape = [1]*sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval



# save basic estimates
model_estimates =   model.trace('estimate')[:]
mean_estimate =     model_estimates.mean(axis=0)
lower_estimate =    percentile(model_estimates, 2.5, axis=0)
upper_estimate =    percentile(model_estimates, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/random effects plus flex time/' + mod_name + '_' + str(sex) + '_' + age + '.csv')

# save draws
draws =     pl.rec_append_fields(
                    rec =   data,
                    names = ['draw_' + str(i+1) for i in range(100)],
                    arrs =  [model.trace('estimate')[i] for i in range(100)])
pl.rec2csv(draws, proj_dir + 'outputs/model results/random effects plus flex time/' + mod_name + '_draws_' + str(sex) + '_' + age + '.csv')
Example #17
    def predict_test(self, save_csv=False):
        ''' Use the MCMC traces to predict the test data '''
        # setup constants
        num_test_rows = self.test_data.shape[0]
        num_iters = self.mod_mc.beta.trace().shape[0]

        # indices
        t_index = dict([(t, i) for i, t in enumerate(self.year_list)])
        a_index = dict([(a, i) for i, a in enumerate(self.age_list)])

        # fixed effects
        X = np.array([self.test_data['x%d'%i] for i in range(self.mod_mc.beta.value.shape[0])])
        BX = np.dot(self.mod_mc.beta.trace(), X)

        # exposure
        '''
        if self.training_type == 'make predictions':
            E = np.ones((num_iters, num_test_rows))*self.test_data.envelope
        else:
            E = np.random.binomial(np.round(self.test_data.envelope).astype('int'), (self.test_data.sample_size/self.test_data.envelope), (num_iters, num_test_rows))
        '''
        E = np.ones((num_iters, num_test_rows))*self.test_data.envelope

        # pi_s
        s_index = [np.where(self.test_data.super_region==s) for s in self.super_region_list]
        t_by_s = [[t_index[self.test_data.year[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
        a_by_s = [[a_index[self.test_data.age[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
        pi_s = np.zeros((num_iters, num_test_rows))
        for s in range(len(self.super_region_list)):
            pi_s[:,s_index[s][0]] = self.mod_mc.pi_s_list.trace()[:,s][:,a_by_s[s],t_by_s[s]]
        self.test_s_index = s_index

        # pi_r
        r_index = [np.where(self.test_data.region==r) for r in self.region_list]
        t_by_r = [[t_index[self.test_data.year[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
        a_by_r = [[a_index[self.test_data.age[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
        pi_r = np.zeros((num_iters, num_test_rows))
        for r in range(len(self.region_list)):
            pi_r[:,r_index[r][0]] = self.mod_mc.pi_r_list.trace()[:,r][:,a_by_r[r],t_by_r[r]]
        self.test_r_index = r_index

        # pi_c
        c_index = [np.where(self.test_data.country==c) for c in self.country_list]
        t_by_c = [[t_index[self.test_data.year[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
        a_by_c = [[a_index[self.test_data.age[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
        pi_c = np.zeros((num_iters, num_test_rows))
        for c in range(len(self.country_list)):
            pi_c[:,c_index[c][0]] = self.mod_mc.pi_c_list.trace()[:,c][:,a_by_c[c],t_by_c[c]]	
        self.test_c_index = c_index

        # make predictions
        import os
        os.chdir('/home/j/Project/Causes of Death/CoDMod/codmod2/')
        import percentile
        predictions = np.exp(BX + np.log(E) + pi_s + pi_r + pi_c)
        mean = predictions.mean(axis=0)
        lower = percentile.percentile(predictions, 2.5, axis=0)
        upper = percentile.percentile(predictions, 97.5, axis=0)
        self.predictions = self.test_data[['country','region','super_region','year','age','pop']]
        self.predictions = recfunctions.append_fields(self.predictions, 'mean_deaths', mean)
        self.predictions = recfunctions.append_fields(self.predictions, 'lower_deaths', lower)
        self.predictions = recfunctions.append_fields(self.predictions, 'upper_deaths', upper)
        if self.training_type != 'make predictions':
            self.predictions = recfunctions.append_fields(self.predictions, 'actual_deaths', self.test_data.cf*self.test_data.envelope)
        self.predictions = self.predictions.view(np.recarray)

        # save the predictions
        if save_csv == True:
            pl.rec2csv(self.predictions, '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name + '_predictions_' + self.cause + '_' + self.sex + '.csv')
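
A shape sketch of the prediction step above: the fixed-effect trace is dotted with the design matrix, exposure enters on the log scale, and the nested random effects are added in log space before exponentiating. All arrays are stand-ins:

import numpy as np

num_iters, num_rows, n_beta = 100, 8, 3
beta_trace = np.random.randn(num_iters, n_beta)   # stand-in MCMC trace
X = np.random.randn(n_beta, num_rows)             # stand-in fixed effects
BX = np.dot(beta_trace, X)                        # (iters, rows)
E = np.ones((num_iters, num_rows)) * 50.          # exposure (envelope)
pi = np.zeros((num_iters, num_rows))              # combined random effects
predictions = np.exp(BX + np.log(E) + pi)         # predicted deaths per draw
print(predictions.mean(axis=0).shape)             # (rows,)
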
Example #18
    def load(self, save_cache=False, use_cache=False, dir='/home/j/Project/Causes of Death/CoDMod/tmp/'):
        '''
        If use_cache=True, loads data from a previous call to the MySQL server.
        Otherwise, loads codmod data from the MySQL server.
        The resulting query will get all the data for a specified cause and sex, plus any covariates specified.
        If save_cache is True, then the results from this will be saved as csvs.
        '''
        # use cached data if specified
        if use_cache == True:
            self.use_cache(dir)
        
        # otherwise, load in the data from MySQL
        else:
        
            # make the sql covariate query
            covs = ''
            for i in list(set(self.covariates_untransformed)):
                if i != 'year':
                    covs = covs + i + ', '
            covs = covs[0:-2]

            # load observed deaths plus covariates
            obs_sql = 'SELECT iso3 as country, a.region, a.super_region, age, year, sex, cf, sample_size, a.envelope, a.pop, ' + covs + ' FROM full_cod_database AS a LEFT JOIN all_covariates USING (iso3,year,sex,age) WHERE a.cod_id="' + self.cause + '";'
            obs = mysql_to_recarray(self.cursor, obs_sql)
            obs = obs[np.where((obs.year >= self.year_range[0]) & (obs.year <= self.year_range[1]) & (obs.age >= self.age_range[0]) & (obs.age <= self.age_range[1]) & (obs.sex == self.sex_num))[0]]

            # load in just covariates (for making predictions)
            all_sql = 'SELECT iso3 as country, region, super_region, age, year, sex, envelope, pop, ' + covs + ' FROM all_covariates;'
            all = mysql_to_recarray(self.cursor, all_sql)
            all = all[np.where((all.year >= self.year_range[0]) & (all.year <= self.year_range[1]) & (all.age >= self.age_range[0]) & (all.age <= self.age_range[1]) & (all.sex == self.sex_num))[0]]
            
            # get rid of rows for which covariates are unavailable
            for i in list(set(self.covariates_untransformed)):
                all = np.delete(all, np.where(np.isnan(all[i]))[0], axis=0)
                obs = np.delete(obs, np.where(np.isnan(obs[i]))[0], axis=0)

            # remove observations in which the CF is missing or outside of (0,1), or where sample size/envelope is missing
            obs = np.delete(obs, np.where((np.isnan(obs.cf)) | (obs.cf > 1.) | (obs.cf < 0.) | (np.isnan(obs.sample_size)) | (obs.sample_size < 1.) | np.isnan(obs.envelope))[0], axis=0)

            # make lists of all the countries/regions/ages/years to predict for
            self.country_list = np.unique(all.country)
            self.region_list = np.unique(all.region)
            self.super_region_list = np.unique(all.super_region)
            self.age_list = np.unique(all.age)
            self.year_list = np.unique(all.year)

            # apply a moving average (5 year window) on cause fractions of 0 or 1, or where sample size is less than 100
            age_lookups = {}
            for a in self.age_list:
                age_lookups[a] = np.where(obs.age == a)[0]
            country_lookups = {}
            country_age_lookups = {}
            for c in self.country_list:
                country_lookups[c] = np.where(obs.country == c)[0]
                for a in self.age_list:
                    country_age_lookups[c+'_'+str(a)] = np.intersect1d(country_lookups[c], age_lookups[a])
            year_window_lookups = {}
            for y in range(self.year_range[0],self.year_range[1]+1):
                year_window_lookups[y] = np.where((obs.year >= y-2.) & (obs.year <= y+2.))[0]
            smooth_me = np.where((obs.cf==0.) | (obs.cf==1.) | (obs.sample_size<100.))[0]
            for i in smooth_me:
                obs.cf[i] = obs.cf[np.intersect1d(country_age_lookups[obs.country[i]+'_'+str(obs.age[i])],year_window_lookups[obs.year[i]])].mean()

            # for cases in which the CF is still 0 or 1 after the moving average, use the smallest/largest non-0/1 CF observed in that region-age
            region_age_lookups = {}
            region_lookups = {}
            for r in self.region_list:
                region_lookups[r] = np.where(obs.region == r)[0]
                for a in self.age_list:
                    region_age_lookups[str(r)+'_'+str(a)] = np.intersect1d(region_lookups[r], age_lookups[a])
            validcfs = np.where((obs.cf>0.) & (obs.cf<1.))[0]
            for i in np.where(obs.cf==0.)[0]:
                candidates = np.intersect1d(region_age_lookups[str(obs.region[i])+'_'+str(obs.age[i])], validcfs)
                if candidates.shape[0] == 0:
                    obs.cf[i] = 0.
                else:
                    obs.cf[i] = obs.cf[candidates].min()
            for i in np.where(obs.cf==1.)[0]:
                candidates = np.intersect1d(region_age_lookups[str(obs.region[i])+'_'+str(obs.age[i])], validcfs)
                if candidates.shape[0] == 0:
                    obs.cf[i] = 1.
                else:
                    obs.cf[i] = obs.cf[candidates].max()

            # finally, any CF that is still 0 or 1 after the above corrections should simply be dropped
            obs = np.delete(obs, np.where((obs.cf == 0.) | (obs.cf == 1.))[0], axis=0)
            
            # we treat our envelope as truth, so never allow sample size to exceed it
            shrink_me = np.where(obs.sample_size > obs.envelope*.999)[0]
            obs.sample_size[shrink_me] = obs.envelope[shrink_me]*.999
            
            # make covariate matrices (including transformations and normalization)
            obs_vectors = [obs.country, obs.region, obs.super_region, obs.year, obs.age, obs.cf, obs.sample_size, obs.envelope, obs.pop, np.ones(obs.shape[0])]
            obs_names = ['country', 'region', 'super_region', 'year', 'age', 'cf', 'sample_size', 'envelope', 'pop', 'x0']
            all_vectors = [all.country, all.region, all.super_region, all.year, all.age, all.envelope, all.pop, np.ones(all.shape[0])]
            all_names = ['country', 'region', 'super_region', 'year', 'age', 'envelope', 'pop', 'x0']
            self.covariate_dict = {'x0': 'constant'}
            for i in range(len(self.covariate_list)):
                a = all[self.covariates_untransformed[i]]
                o = obs[self.covariates_untransformed[i]]
                if self.covariate_transformations[i] == 'ln':
                    a = np.log(a)
                    o = np.log(o)
                elif self.covariate_transformations[i] == 'ln+sq':
                    a = (np.log(a))**2
                    o = (np.log(o))**2
                elif self.covariate_transformations[i] == 'sq':
                    a = a**2
                    o = o**2
                if self.normalize == True:
                    cov_mean = np.mean(a)
                    cov_sd = np.std(a)
                    a = ((a-cov_mean)/cov_sd)
                    o = ((o-cov_mean)/cov_sd)
                all_vectors.append(a)
                all_names.append('x' + str(i+1))
                obs_vectors.append(o)
                obs_names.append('x' + str(i+1))
                self.covariate_dict['x' + str(i+1)] = self.covariate_list[i]

            # create age dummies if specified
            if self.age_dummies == True:
                pre_ref = 1
                for i,j in enumerate(self.age_list):
                    if j == self.age_ref:
                        pre_ref = 0
                    elif pre_ref == 1:
                        all_vectors.append(np.array(all.age==j).astype(np.float))
                        all_names.append('x' + str(len(self.covariate_list)+i+1))
                        obs_vectors.append(np.array(obs.age==j).astype(np.float))
                        obs_names.append('x' + str(len(self.covariate_list)+i+1))
                        self.covariate_dict['x' + str(len(self.covariate_list)+i+1)] = 'Age ' + str(j)
                    else:
                        all_vectors.append(np.array(all.age==j).astype(np.float))
                        all_names.append('x' + str(len(self.covariate_list)+i))
                        obs_vectors.append(np.array(obs.age==j).astype(np.float))
                        obs_names.append('x' + str(len(self.covariate_list)+i))
                        self.covariate_dict['x' + str(len(self.covariate_list)+i)] = 'Age ' + str(j)
            
            # return the prediction and observation matrices
            self.prediction_matrix = np.core.records.fromarrays(all_vectors, names=all_names)
            self.observation_matrix = np.core.records.fromarrays(obs_vectors, names=obs_names)
                
            # prep all the in-sample data
            self.data_rows = self.observation_matrix.shape[0]
            print 'Data Rows:', self.data_rows
            self.training_split()

            # cache the data if requested
            if save_cache == True:
                pl.rec2csv(self.prediction_matrix, '/home/j/Project/Causes of Death/CoDMod/tmp/prediction_matrix_' + self.cause + '_' + self.sex + '.csv')
                pl.rec2csv(self.observation_matrix, '/home/j/Project/Causes of Death/CoDMod/tmp/observation_matrix_' + self.cause + '_' + self.sex + '.csv')

        # load in age weights for creating age adjusted rates later
        age_weights = mysql_to_recarray(self.cursor, 'SELECT age,weight FROM age_weights;')
        age_weights = recfunctions.append_fields(age_weights, 'keep', np.zeros(age_weights.shape[0])).view(np.recarray)
        for a in self.age_list:
            age_weights.keep[np.where(age_weights.age==a)[0]] = 1
        age_weights = np.delete(age_weights, np.where(age_weights.keep==0)[0], axis=0)
        age_weights.weight = age_weights.weight/age_weights.weight.sum()
        self.age_weights = age_weights
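
The smoothing passes above avoid repeated scans by precomputing row-index lookups per key and intersecting them. A minimal sketch of that pattern with illustrative arrays:

import numpy as np

age = np.array([15, 15, 20, 15])
year = np.array([2000, 2001, 2000, 2005])
age_rows = np.where(age == 15)[0]                           # rows for one age
window_rows = np.where((year >= 1999) & (year <= 2003))[0]  # 5-year window
print(np.intersect1d(age_rows, window_rows))                # rows to average: [0 1]
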
Example #19
    def predict_test(self, save_csv=False):
        ''' Use the MCMC traces to predict the test data '''
        # setup constants
        num_test_rows = self.test_data.shape[0]
        num_iters = self.approxs['beta'].shape[0]

        # indices
        t_index = dict([(t, i) for i, t in enumerate(self.year_list)])
        a_index = dict([(a, i) for i, a in enumerate(self.age_list)])

        # fixed effects
        X = np.array([self.test_data['x%d'%i] for i in range(self.mod_mc.beta.value.shape[0])])
        BX = np.dot(self.approxs['beta'], X)

        # exposure
        '''
        if self.training_type == 'make predictions':
            E = np.ones((num_iters, num_test_rows))*self.test_data.envelope
        else:
            E = np.random.binomial(np.round(self.test_data.envelope).astype('int'), (self.test_data.sample_size/self.test_data.envelope), (num_iters, num_test_rows))
        '''
        E = np.ones((num_iters, num_test_rows))*self.test_data.envelope

        # interpolation parameters
        x_samples = self.sample_points[:,0]
        y_samples = self.sample_points[:,1]
        xb = self.age_list[0]
        xe = self.age_list[-1]
        yb = self.year_list[0]
        ye = self.year_list[-1]
        kx = 3 if len(self.age_samples) > 3 else len(self.age_samples)-1
        ky = 3 if len(self.year_samples) > 3 else len(self.year_samples)-1
        
        # pi_s
        s_index = [np.where(self.test_data.super_region==s) for s in self.super_region_list]
        t_by_s = [[t_index[self.test_data.year[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
        a_by_s = [[a_index[self.test_data.age[j]] for j in s_index[s][0]] for s in range(len(self.super_region_list))]
        pi_s = np.zeros((num_iters, num_test_rows))
        for s in range(len(self.super_region_list)):
            for i in range(num_iters):
                interpolator = interpolate.bisplrep(x=x_samples, y=y_samples, z=self.approxs['pi_s_'+str(s)][i], xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
                pi_s[i,s_index[s][0]] = interpolate.bisplev(x=self.age_list, y=self.year_list, tck=interpolator)[a_by_s[s],t_by_s[s]]
            mean_pi_s = pi_s[:,s_index[s][0]].mean(axis=1)
            pi_s[:,s_index[s][0]] = pi_s[:,s_index[s][0]][np.argsort(mean_pi_s)]
        
        # pi_r
        r_index = [np.where(self.test_data.region==r) for r in self.region_list]
        t_by_r = [[t_index[self.test_data.year[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
        a_by_r = [[a_index[self.test_data.age[j]] for j in r_index[r][0]] for r in range(len(self.region_list))]
        pi_r = np.zeros((num_iters, num_test_rows))
        for r in range(len(self.region_list)):
            for i in range(num_iters):
                interpolator = interpolate.bisplrep(x=x_samples, y=y_samples, z=self.approxs['pi_r_'+str(r)][i], xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
                pi_r[i,r_index[r][0]] = interpolate.bisplev(x=self.age_list, y=self.year_list, tck=interpolator)[a_by_r[r],t_by_r[r]]
            mean_pi_r = pi_r[:,r_index[r][0]].mean(axis=1)
            pi_r[:,r_index[r][0]] = pi_r[:,r_index[r][0]][np.argsort(mean_pi_r)]

        # pi_c
        c_index = [np.where(self.test_data.country==c) for c in self.country_list]
        t_by_c = [[t_index[self.test_data.year[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
        a_by_c = [[a_index[self.test_data.age[j]] for j in c_index[c][0]] for c in range(len(self.country_list))]
        pi_c = np.zeros((num_iters, num_test_rows))
        for c in range(len(self.country_list)):
            for i in range(num_iters):
                interpolator = interpolate.bisplrep(x=x_samples, y=y_samples, z=self.approxs['pi_c_'+str(c)][i], xb=xb, xe=xe, yb=yb, ye=ye, kx=kx, ky=ky)
                pi_c[i,c_index[c][0]] = interpolate.bisplev(x=self.age_list, y=self.year_list, tck=interpolator)[a_by_c[c],t_by_c[c]]
            mean_pi_c = pi_c[:,c_index[c][0]].mean(axis=1)
            pi_c[:,c_index[c][0]] = pi_c[:,c_index[c][0]][np.argsort(mean_pi_c)]

        # make predictions
        import os
        os.chdir('/home/j/Project/Causes of Death/CoDMod/codmod2/')
        import percentile
        predictions = np.exp(BX + np.log(E) + pi_s + pi_r + pi_c)
        mean = predictions.mean(axis=0)
        lower = percentile.percentile(predictions, 2.5, axis=0)
        upper = percentile.percentile(predictions, 97.5, axis=0)
        self.predictions = self.test_data[['country','region','super_region','year','age','pop']]
        self.predictions = recfunctions.append_fields(self.predictions, 'mean_deaths', mean)
        self.predictions = recfunctions.append_fields(self.predictions, 'lower_deaths', lower)
        self.predictions = recfunctions.append_fields(self.predictions, 'upper_deaths', upper)
        if self.training_type != 'make predictions':
            self.predictions = recfunctions.append_fields(self.predictions, 'actual_deaths', self.test_data.cf*self.test_data.envelope)
        self.predictions = self.predictions.view(np.recarray)

        # save the predictions
        if save_csv == True:
            pl.rec2csv(self.predictions, '/home/j/Project/Causes of Death/CoDMod/tmp/' + self.name + '_predictions_' + self.cause + '_' + self.sex + '.csv')
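
The random-effect surfaces above are rebuilt from sampled knots with SciPy's bivariate splines. A minimal sketch, assuming scipy.interpolate; the sample points and values are stand-ins:

import numpy as np
from scipy import interpolate

ages = np.array([0., 20., 40., 60.])
years = np.array([1990., 2000., 2010.])
xs, ys = np.meshgrid(ages, years)
zs = np.random.randn(xs.size)                    # stand-in sampled values
tck = interpolate.bisplrep(x=xs.ravel(), y=ys.ravel(), z=zs, kx=3, ky=2)
grid = interpolate.bisplev(x=ages, y=years, tck=tck)  # full (age, year) surface
print(grid.shape)                                # (4, 3)
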
Example #20
## find mean and standard error, drawing from M and C
draws = 1000
mort_draws = np.zeros((draws, len(predictionyears)))
gpr_seeds = [x + 123456 for x in range(1, 1001)]
for draw in range(draws):
    np.random.seed(gpr_seeds[draw])
    mort_draws[draw, :] = Realization(M, C)(predictionyears)
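# each Realization(M, C) drawn above is one function sampled from the Gaussian process
# defined by mean M and covariance C; evaluating it at predictionyears gives one draw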

logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(mort_draws)

os.chdir('FILEPATH' + sex + '_' + age + '/')
all_est = []
for i in range(len(predictionyears)):
    all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                    unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('med', '<f8'), ('lower', '<f8'),
                             ('upper', '<f8')])
pl.rec2csv(all_est, 'gpr_' + cc + '_' + sex + '_' + age + '.txt')

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s, mort_draws[s][i]))

all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])
pl.rec2csv(all_sim, 'gpr_' + cc + '_' + sex + '_' + age + '_sim.txt')
Example #27
0
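# (fragment) the lines below flag coverage: whether the observed value falls inside the
# 95% normal interval implied by the combined data and estimate variances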
                total_var = var + logit_est['std'][pred_index]**2
                coverage = int(
                    (logit_est['med'][pred_index] -
                     1.96 * pl.sqrt(total_var)) < mort <
                    (logit_est['med'][pred_index] + 1.96 * pl.sqrt(total_var)))
                all_err.append((rr, cc, ho, scale, amp2x, mse * amp2x, year,
                                mort, re, coverage))

## write files
if os.name == 'posix':
    os.chdir('FILEPATH')
else:
    os.chdir('FILEPATH')

all_est = pl.array(all_est, [('gbd_region', '|S64'), ('iso3', '|S32'),
                             ('ho', '<f8'), ('scale', '<f8'), ('amp2x', '<f8'),
                             ('amp2', '<f8'), ('year', '<f8'), ('mort', '<f8'),
                             ('std', '<f8')])
pl.rec2csv(all_est, 'gpr_%s_%i.txt' % (cc, ho))

if os.name == 'posix':
    os.chdir('FILEPATH')
else:
    os.chdir('FILEPATH')

all_err = pl.array(all_err, [('gbd_region', '|S64'), ('iso3', '|S32'),
                             ('ho', '<f8'), ('scale', '<f8'), ('amp2x', '<f8'),
                             ('amp2', '<f8'), ('year', '<f8'), ('mort', '<f8'),
                             ('re', '<f8'), ('coverage', '<f8')])
pl.rec2csv(all_err, 'loss_%s_%i.txt' % (cc, ho))
Example #28
0
coldict['lat'] = np.concatenate((duffy_data.lat, vivax_data.lat))
coldict['n'] = np.concatenate((duffy_data.n, vivax_data.pos + vivax_data.neg))
coldict['vivax_pos'] = np.concatenate((duffy_nan, vivax_data.pos))
coldict['vivax_neg'] = np.concatenate((duffy_nan, vivax_data.neg))
coldict['datatype'] = np.concatenate(
    (duffy_data.datatype, np.repeat('vivax', n_vivax)))

for colname in vivaxcols:
    coldict[colname] = np.concatenate((duffy_nan, vivax_data[colname]))

for colname in duffycols:
    coldict[colname] = np.concatenate((duffy_data[colname], vivax_nan))

allcols = coldict.keys()
combined_data = np.rec.fromarrays([coldict[col] for col in allcols],
                                  names=allcols)


# FIXME: Do the Sahel instead.
def box_data(data, llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat):
    indicator = (data.lon > llcrnrlon) * (data.lon < urcrnrlon) * (
        data.lat > llcrnrlat) * (data.lat < urcrnrlat)
    return data[np.where(indicator)]


# Write out
# warnings.warn('Boxing')
# combined_data = combined_data[np.where((combined_data.lon>-19)*(combined_data.lon<54)*(combined_data.lat>0))]
# combined_data = box_data(combined_data, 31.5, 11.5, 64, 32)
rec2csv(combined_data, combined_datafile)
Example #29
0
        for i in range(len(predictionyears)):
            all_est.append((cc, ss, predictionyears[i], unlog_est['med'][i],
                            unlog_est['lower'][i], unlog_est['upper'][i]))

        all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('sex', '|S32'),
                                     ('year', '<f8'), ('mort_med', '<f8'),
                                     ('mort_lower', '<f8'),
                                     ('mort_upper', '<f8')])

        ## no need to save the summary if we're doing the HIV draws version
        if (hiv_uncert == int(1)):
            pass
        else:
            est_file = "FILEPATH"
            pl.rec2csv(all_est, est_file)

        # save the sims
        all_sim = []

        for i in range(len(predictionyears)):
            for s in range(dr):
                if (transform == 'log10'):
                    all_sim.append((cc, ss, predictionyears[i], s,
                                    10**d[s][i]))  # log base 10 space
                elif (transform == 'ln'):
                    all_sim.append((cc, ss, predictionyears[i], s,
                                    math.e**d[s][i]))  # natural log space
                elif (transform == 'logit'):
                    all_sim.append((cc, ss, predictionyears[i], s,
                                    ((math.e**d[s][i]) /
                                     (1 + (math.e**d[s][i])))))  # logit space (closing parens reconstructed from the logit pattern used elsewhere in this file)
Example #30
0
# collapse across draws
# note: space transformations need to be performed at the draw level
# not actually doing any transformations here, we'll do after
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(mort_draws)

# save the predictions
all_est = []
for i in range(len(predictionyears)):
    all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                    unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('med', '<f8'), ('lower', '<f8'),
                             ('upper', '<f8')])

output_file = "FILEPATH"
pl.rec2csv(all_est, output_file)

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((cc, predictionyears[i], s, mort_draws[s][i]))

all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])

output_file = "FILEPATH"
pl.rec2csv(all_sim, output_file)
Example #31
0
def combine_output(J, T, model, dir, reps, save=False):
    """
    Combine output on absolute error, relative error, csmf_accuracy, and coverage from
    multiple runs of validate_once. Either saves the output to disk, or returns arrays
    for each.
    """

    cause = pl.zeros(J * T, dtype='f').view(pl.recarray)
    time = pl.zeros(J * T, dtype='f').view(pl.recarray)
    abs_err = pl.zeros(J * T, dtype='f').view(pl.recarray)
    rel_err = pl.zeros(J * T, dtype='f').view(pl.recarray)
    coverage = pl.zeros(J * T, dtype='f').view(pl.recarray)
    csmf_accuracy = pl.zeros(J * T, dtype='f').view(pl.recarray)

    for i in range(reps):
        metrics = pl.csv2rec('%s/metrics_%s_%i.csv' % (dir, model, i))
        cause = pl.vstack((cause, metrics.cause))
        time = pl.vstack((time, metrics.time))
        abs_err = pl.vstack((abs_err, metrics.abs_err))
        rel_err = pl.vstack((rel_err, metrics.rel_err))
        coverage = pl.vstack((coverage, metrics.coverage))
        csmf_accuracy = pl.vstack((csmf_accuracy, metrics.csmf_accuracy))

    cause = cause[1:, ]
    time = time[1:, ]
    abs_err = abs_err[1:, ]
    rel_err = rel_err[1:, ]
    coverage = coverage[1:, ]
    csmf_accuracy = csmf_accuracy[1:, ]

    mean_abs_err = abs_err.mean(0)
    median_abs_err = pl.median(abs_err, 0)
    mean_rel_err = rel_err.mean(0)
    median_rel_err = pl.median(rel_err, 0)
    mean_csmf_accuracy = csmf_accuracy.mean(0)
    median_csmf_accuracy = pl.median(csmf_accuracy, 0)
    mean_coverage_bycause = coverage.mean(0)
    mean_coverage = coverage.reshape(reps, T, J).mean(0).mean(1)
    percent_total_coverage = (coverage.reshape(reps, T, J).sum(2) == 3).mean(0)
    mean_coverage = pl.array([[i for j in range(J)]
                              for i in mean_coverage]).ravel()
    percent_total_coverage = pl.array([[i for j in range(J)]
                                       for i in percent_total_coverage
                                       ]).ravel()

    models = pl.array([[model for j in range(J)] for i in range(T)]).ravel()
    true_cf = metrics.true_cf
    true_std = metrics.true_std
    std_bias = metrics.std_bias

    all = pl.np.core.records.fromarrays(
        [
            models, cause[0], time[0], true_cf, true_std, std_bias,
            mean_abs_err, median_abs_err, mean_rel_err, median_rel_err,
            mean_csmf_accuracy, median_csmf_accuracy, mean_coverage_bycause,
            mean_coverage, percent_total_coverage
        ],
        names=[
            'model', 'cause', 'time', 'true_cf', 'true_std', 'std_bias',
            'mean_abs_err', 'median_abs_err', 'mean_rel_err', 'median_rel_err',
            'mean_csmf_accuracy', 'median_csmf_accuracy',
            'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'
        ])

    if save:
        pl.rec2csv(all, '%s/%s_summary.csv' % (dir, model))
    else:
        return all
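
# hypothetical usage (argument values are illustrative, not from the source):
# combine_output(J=3, T=32, model='bad_model', dir='../data/v0', reps=100, save=True)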
Example #32
0
    else:
        indexer[axis] = slice(i, i + 2)
        j = i + 1
        weights = np.array([(j - index), (index - i)], float)
        wshape = [1] * sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
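        # the two weights (j - index) and (index - i) sum to one, so the reduce below
        # linearly interpolates between the order statistics bracketing the percentile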
    return np.add.reduce(sorted[indexer] * weights, axis=axis, out=out) / sumval


# save basic estimates
model_estimates = model.trace("estimate")[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec=data, names=["mean", "lower", "upper"], arrs=[mean_estimate, lower_estimate, upper_estimate]
)
pl.rec2csv(
    output, proj_dir + "outputs/model results/spatial smoothing/" + mod_name + "_" + str(sex) + "_" + age + ".csv"
)

# save draws
draws = pl.rec_append_fields(
    rec=data, names=["draw_" + str(i + 1) for i in range(500)], arrs=[model.trace("estimate")[i] for i in range(500)]
)
pl.rec2csv(
    draws, proj_dir + "outputs/model results/spatial smoothing/" + mod_name + "_draws_" + str(sex) + "_" + age + ".csv"
)
Example #33
0
# collapse across draws
# note: space transformations need to be performed at the draw level
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(gpr.inv_logit(mort_draws))

if hivsims == 0:
    os.chdir('FILEPATH')
    all_est = []
    for i in range(len(predictionyears)):
        all_est.append((cc, predictionyears[i], unlogit_est['med'][i],
                        unlogit_est['lower'][i], unlogit_est['upper'][i]))
    all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                                 ('med', '<f8'), ('lower', '<f8'),
                                 ('upper', '<f8')])
    pl.rec2csv(all_est, 'gpr_%s.txt' % cc)

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append(
            (cc, predictionyears[i], s, gpr.inv_logit(mort_draws[s][i])))

all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])

if hivsims == 1:
    os.chdir('FILEPATH')
    pl.rec2csv(all_sim, 'gpr_%s_%s_sim.txt' % (cc, rnum))
else:
Example #34
0
full_dir = '%s/v02_prep_%s' % (indir, iso3)

# get cause list 
causes = list(set([file.split('+')[1] for file in os.listdir(full_dir) if re.search(age, file)]))
causes.remove('HIV') # temporary until Miriam fixes the HIV files 

# gather data and fit model 
cf = data.get_cod_data(full_dir, causes, age, iso3, sex)
m, pi = models.fit_latent_simplex(cf) 

# calculate summary measures
N, T, J = pi.shape
mean = pi.mean(0)
lower = pl.array([[st.mquantiles(pi[:,t,j], 0.025)[0] for j in range(J)] for t in range(T)])
upper = pl.array([[st.mquantiles(pi[:,t,j], 0.975)[0] for j in range(J)] for t in range(T)])

# format summary and save
output = pl.np.core.records.fromarrays(mean.T, names=['%s_mean' % c for c in causes])
output = pl.rec_append_fields(output, ['%s_lower' % c for c in causes], lower.T)
output = pl.rec_append_fields(output, ['%s_upper' % c for c in causes], upper.T)
pl.rec2csv(output, '%s/%s+%s+%s+summary.csv' % (outdir, iso3, age, sex))

# format all sims and save 
pi.shape = (N*T, J)
years = pl.array([t for s in range(N) for t in range(1980, 2012)])
sim = pl.array([s for s in range(N) for t in range(1980, 2012)])
output = pl.np.core.records.fromarrays(pi.T, names=causes)
output = pl.rec_append_fields(output, 'year', years)
output = pl.rec_append_fields(output, 'sim', sim)
pl.rec2csv(output, '%s/%s+%s+%s.csv' % (outdir, iso3, age, sex))
Example #35
0
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, iters):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        all_data = np.delete(
            all_data,
            np.where(np.isnan(all_data['spacetime_' + str(m + 1)]))[0],
            axis=0)

    # Investigate error thrown for HKG, MAC, and SGP... they don't have data, but it's unclear why this breaks line 62
    all_data = all_data[all_data['iso3'] != "HKG"]
    all_data = all_data[all_data['iso3'] != "MAC"]
    all_data = all_data[all_data['iso3'] != "SGP"]

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([all_data.iso3[i] for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [
        np.empty(len(country_age_list), 'float')
        for i in range(iters * number_submodels * 2)
    ]
    iso3 = np.empty(len(country_age_list), '|S3')
    # age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):

        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age == ca]

        # subset just the observed data
        if ca_data['lt_prev'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_prev']) == 0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # identify the dependent variable for this model
            dv = dv_list[m]

            # loop through spacetime/linear
            for x, t in enumerate(['spacetime']):

                # make a list of the spacetime predictions
                ca_prior = np.array([
                    np.mean(ca_data[t + '_' + str(m + 1)][ca_data.year == y])
                    for y in year_list
                ])

                # find the amplitude for this country/age
                amplitude = np.mean(ca_data[t + '_amplitude_' + str(m + 1)])

                # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
                def mean_function(x):
                    return np.interp(x, year_list, ca_prior)

                # setup the covariance function
                M = gp.Mean(mean_function)
                C = gp.Covariance(eval_fun=gp.matern.euclidean,
                                  diff_degree=2,
                                  amp=amplitude,
                                  scale=scale)

                # observe the data if there is any
                if has_data:
                    gp.observe(M=M,
                               C=C,
                               obs_mesh=ca_observed.year,
                               obs_V=ca_observed[t + '_data_variance_' +
                                                 str(m + 1)],
                               obs_vals=ca_observed['lt_prev'])

                # draw realizations from the data
                realizations = [gp.Realization(M, C) for i in range(iters)]

                # save the data for this country/age into the results array
                iso3[country_age_list == ca] = ca[0:3]
                # age_group[country_age_list==ca] = ca[4:]
                year[country_age_list == ca] = year_list.T
                for i in range(iters):
                    draws[((2 * m + x) * iters) + i][
                        country_age_list == ca] = realizations[i](year_list)

    # save the results
    print('Saving GPR results')
    # age_group is commented out above, so only iso3 and year are saved
    names = ['iso3', 'year']
    results = np.core.records.fromarrays([iso3, year], names=names)
    for m in range(number_submodels):
        for x, t in enumerate(['spacetime']):
            for i in range(iters):
                results = recfunctions.append_fields(
                    results, 'gpr_' + str(m + 1) + '_' + t + '_d' + str(i + 1),
                    draws[((2 * m + x) * iters) + i])
            results = recfunctions.append_fields(
                results, 'gpr_' + str(m + 1) + '_' + t + '_mean',
                np.mean(draws[((2 * m + x) * iters):((2 * m + x + 1) * iters)],
                        axis=0))
    # write once, after all submodels have been appended
    rec2csv(results, outfile)
Example #36
0
    lon_ind = np.argmin(np.abs(np.subtract.outer(lon_old, lon_new)), axis=0)
    lat_ind = np.argmin(np.abs(np.subtract.outer(lat_old, lat_new)), axis=0)
    out = lat_new*0
    for i in range(len(lon_new)):
        lai, loi = lat_ind[i], lon_ind[i]
        if data.mask[lai, loi]:
            for d in range(10):
                if not np.all(data.mask[lai-d:lai+d, loi-d:loi+d]):
                    out[i] = mode(data.data[lai-d:lai+d, loi-d:loi+d][np.where(~data.mask[lai-d:lai+d, loi-d:loi+d])])
                    break
                    break
        else:
            out[i] = data[lai,loi]
    if np.any(np.isnan(out)):
        raise ValueError
    return out
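
# (the callback above, evidently passed as nan_handler below, fills each masked grid
# cell with the modal value from the smallest surrounding window containing unmasked data)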

for fname in map(lambda n: n+'.hdf5', covariate_names):
    print('Evaluating %s' % fname)
    colname = os.path.splitext(fname)[0]
    hf = tb.openFile(os.path.join(covariate_path,fname))
    
    cols[colname] = map_utils.interp_geodata(hf.root.lon[:],hf.root.lat[:],hf.root.data[:],cols['lon'],cols['lat'],hf.root.mask[:],order=0,nan_handler=nan_callback)
    if np.any(np.isnan(cols[colname])):
        raise ValueError
    
    hf.close()
    
keys = cols.keys()
data_out = np.rec.fromarrays([cols[k] for k in keys], names=keys)
rec2csv(data_out, os.path.splitext(os.path.basename(sys.argv[1]))[0]+'_with_covariates.csv')
Example #37
0
        weights = np.array(1)
        sumval = 1.0
    else:
        indexer[axis] = slice(i, i+2)
        j = i + 1
        weights = np.array([(j - index), (index - i)],float)
        wshape = [1]*sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval



# save basic estimates
model_estimates =   model.trace('estimate')[:]
mean_estimate =     model_estimates.mean(axis=0)
lower_estimate =    percentile(model_estimates, 2.5, axis=0)
upper_estimate =    percentile(model_estimates, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/spatial smoothing/spatial_intercept_' + str(sex) + '_' + age + '.csv')

# save draws
draws =     pl.rec_append_fields(
                    rec =   data,
                    names = ['draw_' + str(i+1) for i in range(100)],
                    arrs =  [model.trace('estimate')[i] for i in range(100)])
pl.rec2csv(draws, proj_dir + 'outputs/model results/spatial smoothing/spatial_intercept_draws_' + str(sex) + '_' + age + '.csv')
Example #38
0
## find mean and standard error, drawing from M and C
draws = 1000
mort_draws = np.zeros((draws, len(predictionyears)))
gpr_seeds = [x+123456 for x in range(1,1001)]
for draw in range(draws):
	np.random.seed(gpr_seeds[draw])
	mort_draws[draw,:] = Realization(M, C)(predictionyears)


logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(mort_draws)

os.chdir('FILEPATH')
all_est = []
for i in range(len(predictionyears)):
	all_est.append((cc, predictionyears[i], unlogit_est['med'][i], unlogit_est['lower'][i], unlogit_est['upper'][i]))
all_est = pl.array(all_est, [('ihme_loc_id', '|S32'), ('year', '<f8'), ('med', '<f8'), ('lower', '<f8'), ('upper', '<f8')])
pl.rec2csv(all_est, 'gpr_%s.txt' %cc)

# save the sims 
all_sim = []
for i in range(len(predictionyears)):
	for s in range(draws):
		all_sim.append((cc, predictionyears[i], s, mort_draws[s][i]))


all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'), ('sim', '<f8'), ('mort', '<f8')])
pl.rec2csv(all_sim, 'gpr_%s_sim.txt' %cc)

Example #39
0
C = pm.gp.FullRankCovariance(my_st, amp=1, scale=1, inc=np.pi/4, ecc=.3, st=.1, sd=.5, tlc=.2, sf=.1)

dm = np.vstack((lon,lat,t)).T

C_eval = C(dm,dm)

f = pm.rmv_normal_cov(np.sum([cv[name]*vals[name] for name in names],axis=0), C_eval) + np.random.normal(size=n_data+n_pred)*np.sqrt(V)
p = pm.flib.invlogit(f)
ns = 100
pos = pm.rbinomial(ns, p)
neg = ns - pos

print(p)

ra_data = np.rec.fromarrays((pos[:n_data], neg[:n_data], lon[:n_data], lat[:n_data]) + tuple([cv[name][:n_data] for name in names]), names=['pos','neg','lon','lat']+names)
pl.rec2csv(ra_data,'test_data.csv')

ra_pred = np.rec.fromarrays((pos[n_data:], neg[n_data:], lon[n_data:], lat[n_data:]) + tuple([cv[name][n_data:] for name in names]), names=['pos','neg','lon','lat']+names)
pl.rec2csv(ra_pred,'test_pred.csv')

os.system('infer cov_test test_db test_data.csv -t 10 -n 8 -i 100000')
# os.system('cov-test-predict test test_pred.csv 1000 100')
# 
# # ra_data = pl.csv2rec('test_data.csv')
# # ra_pred = pl.csv2rec('test_pred.csv')
# samps = np.fromfile('test_samps.csv',sep=',').reshape((n_pred,-1))
# 
# pos_pred = pos[n_data:]
# neg_pred = neg[n_data:]
# p_pred = (pos_pred+1.)/(pos_pred+neg_pred+2.)
# 
Example #40
0
coldict = {}
coldict["t"] = np.concatenate((duffy_nan, (tstart + tend) / 2.0))
coldict["lon"] = np.concatenate((duffy_data.lon, vivax_data.lon))
coldict["lat"] = np.concatenate((duffy_data.lat, vivax_data.lat))
coldict["n"] = np.concatenate((duffy_data.n, vivax_data.pos + vivax_data.neg))
coldict["vivax_pos"] = np.concatenate((duffy_nan, vivax_data.pos))
coldict["vivax_neg"] = np.concatenate((duffy_nan, vivax_data.neg))
coldict["datatype"] = np.concatenate((duffy_data.datatype, np.repeat("vivax", n_vivax)))

for colname in vivaxcols:
    coldict[colname] = np.concatenate((duffy_nan, vivax_data[colname]))

for colname in duffycols:
    coldict[colname] = np.concatenate((duffy_data[colname], vivax_nan))

allcols = coldict.keys()
combined_data = np.rec.fromarrays([coldict[col] for col in allcols], names=allcols)

# FIXME: Do the Sahel instead.
def box_data(data, llcrnrlon, llcrnrlat, urcrnrlon, urcrnrlat):
    indicator = (data.lon > llcrnrlon) * (data.lon < urcrnrlon) * (data.lat > llcrnrlat) * (data.lat < urcrnrlat)
    return data[np.where(indicator)]


# Write out
# warnings.warn('Boxing')
# combined_data = combined_data[np.where((combined_data.lon>-19)*(combined_data.lon<54)*(combined_data.lat>0))]
# combined_data = box_data(combined_data, 31.5, 11.5, 64, 32)
rec2csv(combined_data, combined_datafile)
Example #41
0
def compile_all_results(scenarios, dir='../data'):
    """
    Compiles the results across multiple scenarios produced by running run_on_cluster on each
    one into a single csv file. The specified directory must be where the results of
    running run_on_cluster for each scenario are stored (each is a sub-directory named v0, v1, etc.)
    and is also where the output from this function will be saved.
    """

    models = []
    causes = []
    time = []
    true_cf = []
    true_std = []
    std_bias = []
    mean_abs_err = []
    median_abs_err = []
    mean_rel_err = []
    median_rel_err = []
    mean_csmf_accuracy = []
    median_csmf_accuracy = []
    mean_coverage_bycause = []
    mean_coverage = []
    percent_total_coverage = []
    scenario = []

    for i in range(scenarios):
        for j in ['bad_model', 'latent_simplex']:
            read = csv.reader(open('%s/v%s/%s_summary.csv' % (dir, i, j)))
            next(read)  # skip the header row
            for row in read:
                models.append(row[0])
                causes.append(row[1])
                time.append(row[2])
                true_cf.append(row[3])
                true_std.append(row[4])
                std_bias.append(row[5])
                mean_abs_err.append(row[6])
                median_abs_err.append(row[7])
                mean_rel_err.append(row[8])
                median_rel_err.append(row[9])
                mean_csmf_accuracy.append(row[10])
                median_csmf_accuracy.append(row[11])
                mean_coverage_bycause.append(row[12])
                mean_coverage.append(row[13])
                percent_total_coverage.append(row[14])
                scenario.append(i)

    all = pl.np.core.records.fromarrays(
        [
            scenario, models, time, true_cf, true_std, causes, mean_abs_err,
            median_abs_err, mean_rel_err, median_rel_err, mean_csmf_accuracy,
            median_csmf_accuracy, mean_coverage_bycause, mean_coverage,
            percent_total_coverage
        ],
        names=[
            'scenario', 'model', 'time', 'true_cf', 'true_std', 'cause',
            'mean_abs_err', 'median_abs_err', 'mean_rel_err', 'median_rel_err',
            'mean_csmf_accuracy', 'median_csmf_accuracy',
            'mean_coverage_bycause', 'mean_coverage', 'percent_total_coverage'
        ])
    pl.rec2csv(all, fname='%s/all_summary_metrics.csv' % (dir))
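
# hypothetical usage (scenario count is illustrative, not from the source):
# compile_all_results(16, dir='../data')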
Example #42
0
                                mse=mse)
else:  # data model
    [M, C] = gpr.gpmodel(ihme_loc_id, region_name, data_year, data_mort,
                         data_var, data_category, prior_year, prior_mort, mse,
                         best_scale, best_amp2x, predictionyears)

## find mean and standard error, drawing from M and C
draws = 1000
mort_draws = np.zeros((draws, len(predictionyears)))
gpr_seeds = [x + 123456 for x in range(1, 1001)]
for draw in range(draws):
    np.random.seed(gpr_seeds[draw])
    mort_draws[draw, :] = Realization(M, C)(predictionyears)

# collapse across draws
# note: space transformations need to be performed at the draw level
logit_est = gpr.collapse_sims(mort_draws)
unlogit_est = gpr.collapse_sims(gpr.inv_logit(mort_draws))

# save the sims
all_sim = []
for i in range(len(predictionyears)):
    for s in range(draws):
        all_sim.append((ihme_loc_id, predictionyears[i], s,
                        gpr.inv_logit(mort_draws[s][i])))

all_sim = pl.array(all_sim, [('ihme_loc_id', '|S32'), ('year', '<f8'),
                             ('sim', '<f8'), ('mort', '<f8')])

pl.rec2csv(all_sim, "FILEPATH")
Example #43
0
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval

import time
print('Finished at %s' % time.ctime())

# save basic predictions
predictions =       model.trace('predicted')[:]
mean_prediction =   predictions.mean(axis=0)
lower_prediction =  percentile(predictions, 2.5, axis=0)
upper_prediction =  percentile(predictions, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_prediction, lower_prediction, upper_prediction])
pl.rec2csv(output, proj_dir + 'outputs/model results/epi transition by state/all_cause_males.csv')

# plot surfaces
from    mpl_toolkits.mplot3d    import axes3d
import  matplotlib.pyplot       as plt
from    matplotlib.backends.backend_pdf import PdfPages
pp =    PdfPages(proj_dir + 'outputs/model results/epi transition by state/surfaces.pdf')
fig =   plt.figure()
ax =    fig.gca(projection='3d')
X,Y =   np.meshgrid(years, ages)
Z =     model.trace('alpha_surf')[:].mean(axis=0)
ax.plot_wireframe(X, Y, Z, color='#315B7E')
ax.set_title('National')
pp.savefig()
for g in g_list:
    fig =   plt.figure()
Example #44
0
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval



# save basic estimates
model_estimates =   model.trace('estimate')[:]
mean_estimate =     model_estimates.mean(axis=0)
lower_estimate =    percentile(model_estimates, 2.5, axis=0)
upper_estimate =    percentile(model_estimates, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/random effects plus flex time/srw_plus_interaction_results.csv')


'''
### plot diagnostics
# setup plotting
#import matplotlib.pyplot as pp
#pp.switch_backend('acc')
plot_me = [mu_si, mu_ss, mu_ci, mu_cs, sigma_si, sigma_ss, sigma_ci, sigma_cs, state_intercepts, state_slopes, cause_intercepts, cause_slopes]

# plot traces
os.chdir(proj_dir + '/outputs/model results/simple random effects by state/mcmc plots/traces/')
for p in plot_me:
    mc.Matplot.plot(p, suffix='_trace')
    if len(p.shape) == 0:
        plt.close()
Example #45
0
        wshape = [1] * sorted.ndim
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer] * weights, axis=axis, out=out) / sumval


# save basic estimates
model_estimates = model.trace("estimate")[:]
mean_estimate = model_estimates.mean(axis=0)
lower_estimate = percentile(model_estimates, 2.5, axis=0)
upper_estimate = percentile(model_estimates, 97.5, axis=0)
output = pl.rec_append_fields(
    rec=data, names=["mean", "lower", "upper"], arrs=[mean_estimate, lower_estimate, upper_estimate]
)
pl.rec2csv(output, proj_dir + "outputs/model results/simple random effects by state/pymc_results.csv")


### plot diagnostics
# setup plotting
# import matplotlib.pyplot as pp
# pp.switch_backend('acc')
plot_me = [
    mu_si,
    mu_ss,
    mu_ci,
    mu_cs,
    sigma_si,
    sigma_ss,
    sigma_ci,
    sigma_cs,
Example #46
0
def fit_GPR(infile, outfile, dv_list, scale, number_submodels, test):
    # load in the data
    all_data = csv2rec(infile, use_mrecords=False)
    for m in range(number_submodels):
        if all_data['spacetime_' + str(m+1)].dtype == 'float64':
            all_data = np.delete(all_data, np.where(np.isnan(all_data['spacetime_' + str(m+1)]))[0], axis=0)

    # find the list of years for which we need to predict
    year_list = np.unique(all_data.year)

    # find the list of country/age groups
    country_age = np.array([str(all_data.iso3[i]) + '_' + str(all_data.age_group[i]) for i in range(len(all_data))])
    country_age_list = np.repeat(np.unique(country_age), len(year_list))

    # make empty arrays in which to store the results
    draws = [np.empty(len(country_age_list), 'float') for i in range(number_submodels)]
    iso3 = np.empty(len(country_age_list), '|S3')
    age_group = np.empty(len(country_age_list), 'int')
    year = np.empty(len(country_age_list), 'int')

    # loop through country/age groups
    for ca in np.unique(country_age_list):
        print('GPRing ' + ca)

        # subset the data for this particular country/age
        ca_data = all_data[country_age==ca]

        # subset just the observed data
        if ca_data['lt_cf'].dtype != '|O8':
            ca_observed = ca_data[(np.isnan(ca_data['lt_cf'])==0) & (ca_data['test_' + test]==0)]
            if len(ca_observed) > 1:
                has_data = True
            else:
                has_data = False
        else:
            has_data = False

        # loop through each submodel
        for m in range(number_submodels):

            # skip models with no spacetime results
            if all_data['spacetime_' + str(m+1)].dtype != 'float64':
                draws[m][country_age_list==ca] = np.NaN
                continue

            # identify the dependent variable for this model
            dv = dv_list[m]

            # make a list of the spacetime predictions
            ca_prior = np.array([np.mean(ca_data['spacetime_' + str(m+1)][ca_data.year==y]) for y in year_list])

            # find the amplitude for this country/age
            amplitude = np.mean(ca_data['spacetime_amplitude_' + str(m+1)])

            # make a linear interpolation of the spatio-temporal predictions to use as the mean function for GPR
            def mean_function(x):
                return np.interp(x, year_list, ca_prior)

            # setup the covariance function
            M = gp.Mean(mean_function)
            C = gp.Covariance(eval_fun=gp.matern.euclidean, diff_degree=2, amp=amplitude, scale=scale)

            # observe the data if there is any
            if has_data:
                gp.observe(M=M, C=C, obs_mesh=ca_observed.year, obs_V=ca_observed['spacetime_data_variance_' + str(m+1)], obs_vals=ca_observed[dv])

            # save the data for this country/age into the results array
            iso3[country_age_list==ca] = ca[0:3]
            age_group[country_age_list==ca] = ca[4:]
            year[country_age_list==ca] = year_list.T
            draws[m][country_age_list==ca] = M(year_list)

    # save the results
    print('Saving GPR results')
    names = ['iso3','age_group','year']
    results = np.core.records.fromarrays([iso3,age_group,year], names=names)
    for m in range(number_submodels):
        results = recfunctions.append_fields(results, 'gpr_' + str(m+1) + '_spacetime_mean', draws[m])
    rec2csv(results, outfile)
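
# a hypothetical call for orientation; argument values are illustrative only:
# fit_GPR('FILEPATH/linear_preds.csv', 'FILEPATH/gpr_out.csv',
#         dv_list=['lt_cf'], scale=40, number_submodels=1, test='insample')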
Example #47
0
def rec2csv_2d(Y, fname):
    """
    write a 2-dimensional recarray to a csv file
    """
    
    pl.rec2csv(pl.np.core.records.fromarrays(Y.T), fname)
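
# hypothetical usage: each column of Y becomes an unnamed field (f0, f1, ...) in the csv
# rec2csv_2d(pl.np.random.rand(10, 3), 'draws.csv')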
Example #48
0
        wshape[axis] = 2
        weights.shape = wshape
        sumval = weights.sum()
    return np.add.reduce(sorted[indexer]*weights, axis=axis, out=out)/sumval



# save basic estimates
model_estimates =   model.trace('estimate')[:]
mean_estimate =     model_estimates.mean(axis=0)
lower_estimate =    percentile(model_estimates, 2.5, axis=0)
upper_estimate =    percentile(model_estimates, 97.5, axis=0)
output =            pl.rec_append_fields(  rec =   data, 
                        names = ['mean', 'lower', 'upper'], 
                        arrs =  [mean_estimate, lower_estimate, upper_estimate])
pl.rec2csv(output, proj_dir + 'outputs/model results/random effects plus flex time/pymc_results.csv')


'''
### plot diagnostics
# setup plotting
#import matplotlib.pyplot as pp
#pp.switch_backend('acc')
plot_me = [mu_si, mu_ss, mu_ci, mu_cs, sigma_si, sigma_ss, sigma_ci, sigma_cs, state_intercepts, state_slopes, cause_intercepts, cause_slopes]

# plot traces
os.chdir(proj_dir + '/outputs/model results/simple random effects by state/mcmc plots/traces/')
for p in plot_me:
    mc.Matplot.plot(p, suffix='_trace')
    if len(p.shape) == 0:
        plt.close()
Example #49
0
					if (transform == 'log10'):
						re = (10**mort - unlog_est['med'][pred_index])/(10**mort) # log base 10
						#print('1') 
					elif (transform == 'ln'):
						re = (math.e**mort - unlog_est['med'][pred_index])/(math.e**mort) # natural log
						#print('2')
					elif (transform == 'logit'):
						re = (((math.e**mort)/(1+(math.e**mort))) - unlog_est['med'][pred_index])/((math.e**mort)/(1+(math.e**mort))) # logit
						#print('3')
					elif (transform == 'logit10'):
						re = (((10**mort)/(1+(10**mort))) - unlog_est['med'][pred_index])/((10**mort)/(1+(10**mort))) # logit
						#print('4')
					total_var = stderr**2 + log_est['std'][pred_index]**2 # This evaluates coverage as if any part of uncertainty of data and estimates overlap
					# total_var = log_est['std'][pred_index]**2 # This calculates coverage based only on the uncertainty of the estimate
					coverage = int((log_est['med'][pred_index] - 1.96*pl.sqrt(total_var)) < mort < (log_est['med'][pred_index] + 1.96*pl.sqrt(total_var)))
					all_err.append((rr, cc, ss, ho, scale, amp2x, lam, zeta, mse*amp2x, year, mort, re, coverage))
		
		## write files 
		os.chdir('/strPath')
		all_est = pl.array(all_est, [('region_name', '|S64'), ('ihme_loc_id', '|S32'), ('sex', '|S32'), ('ho', '<f8'), 
									 ('scale', '<f8'), ('amp2x', '<f8'), ('lambda', '<f8'), ('zeta', '<f8'), ('amp2', '<f8'), ('year', '<f8'), 
									 ('mort', '<f8'), ('std', '<f8')])
		pl.rec2csv(all_est, 'gpr_%s_%s_%i_%s_%s.txt' %(cc, ss, ho, lam, zeta))
		
		os.chdir('strPath')
		all_err = pl.array(all_err, [('region_name', '|S64'), ('ihme_loc_id', '|S32'), ('sex', '|S32'), ('ho', '<f8'), 
									 ('scale', '<f8'), ('amp2x', '<f8'),  ('lambda', '<f8'), ('zeta', '<f8'),('amp2', '<f8'), ('year', '<f8'), 
									 ('mort', '<f8'), ('re', '<f8'), ('coverage', '<f8')])
		pl.rec2csv(all_err, 'loss_%s_%s_%i_%s_%s.txt' %(cc, ss, ho, lam, zeta))