예제 #1
0
    def predict(self, times, predict_space, predict_group):
        """Predict for one group by blending and averaging the sub-models.

        A parameterization (``beta`` or ``p``) is enabled when its weight is
        positive.  For each enabled parameterization the loose and the tight
        model are predicted and merged with ``convex_combination`` between
        ``self.blend_start_t`` and ``self.blend_end_t``.  When both are
        enabled the two blends are merged with ``model_average`` using
        ``self.beta_weight`` and ``self.p_weight``; otherwise the single
        available blend is returned unchanged.

        Args:
            times: Prediction times; forwarded as ``t`` to the models and to
                ``convex_combination``.
            predict_space: Functional form to predict in; forwarded as
                ``prediction_functional_form`` to the models and as
                ``pred_fun`` to the combination helpers.
            predict_group: Group name forwarded as ``group_name``.

        Returns:
            Whatever ``model_average`` / ``convex_combination`` return for
            the requested times.

        Raises:
            RuntimeError: If the weights are not one of the supported
                combinations (both positive, or one positive and the other
                exactly zero).
        """
        use_beta = self.beta_weight > 0
        use_p = self.p_weight > 0

        # Validate before running any model so an unsupported configuration
        # fails immediately and with an explanation.
        supported = ((use_beta and use_p)
                     or (use_beta and self.p_weight == 0)
                     or (self.beta_weight == 0 and use_p))
        if not supported:
            raise RuntimeError(
                'Unsupported model weights: '
                f'beta_weight={self.beta_weight!r}, '
                f'p_weight={self.p_weight!r}. Expected both positive, or '
                'one positive and the other exactly zero.')

        def blend(loose_model, tight_model):
            # Loose is predicted first, then tight, matching the historical
            # call order.
            loose_predictions = loose_model.predict(
                t=times,
                group_name=predict_group,
                prediction_functional_form=predict_space)
            tight_predictions = tight_model.predict(
                t=times,
                group_name=predict_group,
                prediction_functional_form=predict_space)
            return convex_combination(t=times,
                                      pred1=tight_predictions,
                                      pred2=loose_predictions,
                                      pred_fun=predict_space,
                                      start_day=self.blend_start_t,
                                      end_day=self.blend_end_t)

        beta_predictions = None
        p_predictions = None
        if use_beta:
            beta_predictions = blend(self.loose_beta_model,
                                     self.tight_beta_model)
        if use_p:
            p_predictions = blend(self.loose_p_model, self.tight_p_model)

        if use_beta and use_p:
            return model_average(pred1=beta_predictions,
                                 pred2=p_predictions,
                                 w1=self.beta_weight,
                                 w2=self.p_weight,
                                 pred_fun=predict_space)
        return beta_predictions if use_beta else p_predictions
예제 #2
0
def test_convex_combination(t, mat1, mat2, pred_fun, start_day, end_day,
                            result):
    """Check ``utils.convex_combination`` against the expected ``result``.

    The two prediction matrices are blended over the window given by
    ``start_day`` / ``end_day`` and compared with ``np.allclose``.
    """
    blend_window = {'start_day': start_day, 'end_day': end_day}
    blended = utils.convex_combination(t, mat1, mat2, pred_fun,
                                       **blend_window)
    is_close = np.allclose(result, blended)
    assert is_close
예제 #3
0
def ap_model(df,
             model_location,
             location_cov,
             n_draws,
             peaked_groups,
             exclude_groups,
             fix_gamma,
             fix_point,
             fix_day,
             pred_days=150):
    """Fit a tight-prior and a loose-prior ``APModel`` and blend their draws.

    Two ``APModel`` instances that differ in their observation-SE column,
    ``prior_modifier`` and ``fun_gprior`` are fit to the same data.  Their
    per-group draws and their "overall" draws are each merged with
    ``convex_combination`` (window ``start_day=2`` to ``end_day=25``) and
    converted to log cumulative space.

    Args:
        df: Input data.  The function reads the columns ``'Days'``,
            ``'pseudo'`` and ``'ln(age-standardized death rate)'`` and passes
            ``'location_id'``, ``COVARIATE`` and ``'intercept'`` on to
            ``process_input``.  The frame is copied, not modified.
        model_location: Group key to which ``fix_point`` / ``fix_day`` apply
            and whose last day/observation anchor the overall draws.
        location_cov: Covariate value placed in the middle slot of the
            covariate vector used for the overall draws.
        n_draws: Number of draws; forwarded to ``run`` and used as
            ``sample_size`` for the overall draws.
        peaked_groups: Forwarded to ``APModel`` as ``peaked_groups``.
        exclude_groups: Forwarded to ``run`` as ``exclude_groups``.
        fix_gamma: If truthy, the fixed-effect bounds of the second
            parameter are pinned to ``[1, 1]`` in both models.
        fix_point: If not ``None``, used as the last observation for
            ``model_location`` and for the overall draws.
        fix_day: Paired with ``fix_point`` in ``last_info``; if not ``None``
            it is used as the last day for the overall draws.
        pred_days: Number of prediction time points
            (``np.arange(pred_days)``).

    Returns:
        Tuple ``(tight_model, loose_model, combined_draws)``.
        ``combined_draws`` maps every group key of the tight draws, plus
        ``'overall'``, to a ``(times, log_cumulative_draws)`` pair.
    """
    # our dataset (rename days as model assumes it's lower case)
    df = df.copy()
    df = df.rename(index=str, columns={'Days': 'days'})

    # ---------------------------------------------------------------------
    # SET UP
    # ---------------------------------------------------------------------
    # basic information and model setting
    basic_info_dict = dict(all_cov_names=[COVARIATE],
                           col_t='days',
                           col_group='location',
                           predict_space=ln_gaussian_pdf,
                           col_obs_compare='ln asddr',
                           peaked_groups=peaked_groups)
    # Three parameters (alpha, beta, p); only beta is driven by COVARIATE,
    # the other two use the intercept column.
    basic_model_dict = dict(
        param_names=['alpha', 'beta', 'p'],
        col_covs=[['intercept'], [COVARIATE], ['intercept']],
        link_fun=[np.exp, lambda x: x, np.exp],
        var_link_fun=[lambda x: x, lambda x: x, lambda x: x])

    # basic fit parameter
    dummy_gprior = [0.0, np.inf]
    dummy_uprior = [-np.inf, np.inf]
    zero_uprior = [0.0, 0.0]
    fe_init = np.array([-2.5, 28.0, -8.05])
    fe_bounds = [[-np.inf, 0.0], [15.0, 100.0], [-10, -6]]
    options = {'ftol': 1e-10, 'gtol': 1e-10, 'maxiter': 500, 'disp': False}
    basic_fit_dict = dict(fe_init=fe_init,
                          fe_bounds=fe_bounds,
                          re_bounds=[zero_uprior] * 3,
                          fe_gprior=[dummy_gprior] * 3,
                          re_gprior=[dummy_gprior] * 3,
                          options=options)
    # The joint fit reuses ``options`` for the smart initialization but caps
    # its own optimizer at 10 iterations.
    basic_joint_model_fit_dict = dict(
        fe_gprior=[dummy_gprior] * 3,
        re_bounds=[dummy_uprior] * 3,
        re_gprior=[dummy_gprior, [0.0, 10.0], dummy_gprior],
        smart_initialize=True,
        smart_init_options=options,
        options={
            'ftol': 1e-10,
            'gtol': 1e-10,
            'maxiter': 10,
            'disp': False
        })

    # draw related parameters
    draw_dict = dict(n_draws=n_draws,
                     prediction_times=np.arange(pred_days),
                     cv_lower_threshold=1e-4,
                     cv_upper_threshold=1.,
                     smoothed_radius=[5, 5],
                     exclude_groups=exclude_groups,
                     exclude_below=0,
                     num_smooths=2)

    # for the convex combination
    start_day = 2
    end_day = 25

    # for prediction of places with no data
    alpha_times_beta = np.exp(0.7)
    obs_bounds = [40, np.inf]  # filter the data rich models
    predict_cov = np.array([1.0, location_cov,
                            1.0])  # new covariates for the places.

    # tight prior control panel
    tight_info_dict = {
        **deepcopy(basic_info_dict),
        'fun': ln_gaussian_cdf,
        'col_obs': 'ln ascdr',
        'col_obs_se': 'obs_se_tight',
        # disabled alternative: 'obs_se_func': lambda x: (1. / (1. + x)),
        'obs_se_func': None,
        'prior_modifier':
        lambda x: 10**(min(0.0, max(-1.0, 0.1 * x - 1.5))) / 10
    }
    tight_fit_dict = {
        **deepcopy(basic_fit_dict), 'fun_gprior':
        [lambda params: params[0] * params[1], [np.exp(0.7), 1.0]]
    }

    # loose prior control panel
    loose_info_dict = {
        **deepcopy(basic_info_dict),
        'fun': ln_gaussian_cdf,
        'col_obs': 'ln ascdr',
        'col_obs_se': 'obs_se_loose',
        # disabled alternative: 'obs_se_func': lambda x: (1 / (0.1 + x**1.4)),
        'obs_se_func': None,
        'prior_modifier': lambda x: 0.2
    }
    loose_fit_dict = {
        **deepcopy(basic_fit_dict), 'fun_gprior':
        [lambda params: params[0] * params[1], dummy_gprior]
    }

    # prepare data (must exponentiate smoothed column, non-logged col is not smoothed)
    # Observation SEs shrink with the day index; pseudo rows get PSEUDO_SE.
    df['obs_se_tight'] = 1 / (1 + df['days'])
    df['obs_se_loose'] = 1 / (1 + df['days']**1.4)
    df.loc[df['pseudo'] == 1, 'obs_se_tight'] = PSEUDO_SE
    df.loc[df['pseudo'] == 1, 'obs_se_loose'] = PSEUDO_SE
    df['Age-standardized death rate'] = np.exp(
        df['ln(age-standardized death rate)'])
    # NOTE(review): process_input presumably derives the 'location',
    # 'ln ascdr' and 'ln asddr' columns referenced above -- confirm.
    df = process_input(
        df,
        'location_id',
        'days',
        'Age-standardized death rate',
        col_covs=[COVARIATE, 'intercept', 'obs_se_tight', 'obs_se_loose'])

    #############
    # RUN MODEL #
    #############
    # set up last info
    if fix_point is not None:
        last_info = {model_location: [fix_day, fix_point]}
    else:
        last_info = None

    # The Alpha Prior Model
    tight_model = APModel(all_data=df,
                          **tight_info_dict,
                          joint_model_fit_dict=basic_joint_model_fit_dict,
                          basic_model_dict=basic_model_dict,
                          fit_dict=tight_fit_dict)
    if fix_gamma:
        # Pin the second parameter's fixed-effect bounds to exactly 1.
        fe_bounds = tight_model.fit_dict['fe_bounds']
        tight_model.fit_dict.update(
            {'fe_bounds': [fe_bounds[0], [1, 1], fe_bounds[2]]})
    tight_model.run(last_info=last_info, **draw_dict)
    loose_model = APModel(all_data=df,
                          **loose_info_dict,
                          joint_model_fit_dict=basic_joint_model_fit_dict,
                          basic_model_dict=basic_model_dict,
                          fit_dict=loose_fit_dict)
    if fix_gamma:
        # Same pinning as for the tight model.
        fe_bounds = loose_model.fit_dict['fe_bounds']
        loose_model.fit_dict.update(
            {'fe_bounds': [fe_bounds[0], [1, 1], fe_bounds[2]]})
    loose_model.run(last_info=last_info, **draw_dict)

    # get truncated draws
    tight_draws = tight_model.process_draws(draw_dict['prediction_times'],
                                            last_info=last_info)
    loose_draws = loose_model.process_draws(draw_dict['prediction_times'],
                                            last_info=last_info)
    combined_draws = {}
    for group in tight_draws.keys():
        # Rows of each draw matrix are sorted by their final-column value
        # before blending, so tight and loose draws are paired by rank.
        draws = convex_combination(
            np.arange(tight_draws[group][1].shape[1]),
            tight_draws[group][1][np.argsort(tight_draws[group][1][:, -1]), :],
            loose_draws[group][1][np.argsort(loose_draws[group][1][:, -1]), :],
            basic_info_dict['predict_space'],
            start_day=start_day,
            end_day=end_day)
        if group == model_location and fix_point is not None:
            last_obs = fix_point
        else:
            last_obs = tight_model.models[group].obs[-1]
        # Exponentiate the blended draws, accumulate them along axis 1 on
        # top of exp(last_obs), and return to log space.
        combined_draws.update({
            group: (tight_draws[group][0],
                    np.log(np.exp(last_obs) + np.exp(draws).cumsum(axis=1)))
        })

    # get overall draws
    filtered_tight_models = tight_model.run_filtered_models(
        df=tight_model.all_data, obs_bounds=obs_bounds)
    overall_tight_draws = tight_model.create_overall_draws(
        draw_dict['prediction_times'],
        filtered_tight_models,
        predict_cov,
        alpha_times_beta=alpha_times_beta,
        sample_size=draw_dict['n_draws'],
        slope_at=10,
        epsilon=draw_dict['cv_lower_threshold'])
    filtered_loose_models = loose_model.run_filtered_models(
        df=loose_model.all_data, obs_bounds=obs_bounds)
    overall_loose_draws = loose_model.create_overall_draws(
        draw_dict['prediction_times'],
        filtered_loose_models,
        predict_cov,
        alpha_times_beta=alpha_times_beta,
        sample_size=draw_dict['n_draws'],
        slope_at=10,
        epsilon=draw_dict['cv_lower_threshold'])

    # get specs and truncate overall, then combine
    if model_location in list(combined_draws.keys()):
        # The location was modelled: anchor on its own last day/observation
        # unless an explicit fix_day / fix_point overrides them.
        if fix_day is None:
            last_day = tight_model.models[model_location].t[-1]
        else:
            last_day = fix_day
        if fix_point is not None:
            last_obs = fix_point
        else:
            last_obs = tight_model.models[model_location].obs[-1]
        overall_time = draw_dict['prediction_times'][int(np.round(last_day)):]
    else:
        # The location was not modelled: start from the first prediction
        # time and RATE_THRESHOLD unless fix_day / fix_point are given.
        if fix_day is None:
            last_day = draw_dict['prediction_times'][0]
        else:
            last_day = fix_day
        if fix_point is not None:
            last_obs = fix_point
        else:
            last_obs = RATE_THRESHOLD
        overall_time = np.arange(last_day, pred_days)
    overall_tight_draws = truncate_draws(
        t=draw_dict['prediction_times'],
        draws=overall_tight_draws,
        draw_space=basic_info_dict['predict_space'],
        last_day=last_day,
        last_obs=last_obs,
        last_obs_space=tight_info_dict['fun'])
    overall_loose_draws = truncate_draws(
        t=draw_dict['prediction_times'],
        draws=overall_loose_draws,
        draw_space=basic_info_dict['predict_space'],
        last_day=last_day,
        last_obs=last_obs,
        last_obs_space=loose_info_dict['fun'])
    # Same rank-paired blend and log-cumulative transform as for the
    # per-group draws above.
    draws = convex_combination(
        np.arange(overall_tight_draws.shape[1]),
        overall_tight_draws[np.argsort(overall_tight_draws[:, -1]), :],
        overall_loose_draws[np.argsort(overall_loose_draws[:, -1]), :],
        basic_info_dict['predict_space'],
        start_day=start_day,
        end_day=end_day)
    combined_draws.update({
        'overall': (overall_time[1:],
                    np.log(np.exp(last_obs) + np.exp(draws).cumsum(axis=1)))
    })

    return tight_model, loose_model, combined_draws
예제 #4
0
def _parse_death_model_args():
    """Parse the command-line options read by ``run_death_models``.

    No option is marked required, so an omitted option comes back as
    ``None`` on the returned namespace.
    """
    # For interactive debugging the parsed namespace can be replaced by a
    # hand-built one, e.g.:
    # args = argparse.Namespace(
    #     cov_file='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/model_data_descartes_21/555_covariate.csv',
    #     covariate_effect='gamma',
    #     data_file='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/model_data_descartes_21/555.csv',
    #     last_day_file='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/last_day.csv',
    #     model_location_id=555,
    #     n_draws=333,
    #     n_b=43,
    #     output_dir='/ihme/covid-19/deaths/dev/2020_05_03_US_boundary/model_data_descartes_21/555',
    #     peaked_file='/ihme/covid-19/deaths/mobility_inputs/2020_04_20/peak_locs_april20_.csv'
    # )
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_location_id',
                        help='id of location to which we are standardizing.',
                        type=int)
    parser.add_argument('--data_file',
                        help='Name of location-standardized data file.',
                        type=str)
    parser.add_argument('--cov_file', help='Name of covariate file.', type=str)
    parser.add_argument('--last_day_file',
                        help='Name of last day of deaths file.',
                        type=str)
    parser.add_argument('--peaked_file',
                        help='Name of peaked locations file.',
                        type=str)
    parser.add_argument('--output_dir',
                        help='Where we are storing results.',
                        type=str)
    parser.add_argument('--covariate_effect',
                        help='Whether covariate is acting on beta or gamma.',
                        type=str)
    parser.add_argument('--n_draws',
                        help='How many samples to take.',
                        type=int)
    return parser.parse_args()


def _dump_pickle(obj, path):
    """Pickle ``obj`` to ``path`` using the highest available protocol."""
    with open(path, 'wb') as fwrite:
        pickle.dump(obj, fwrite, -1)


def run_death_models():
    """Command-line entry point: fit the death model for one location.

    Reads the data, covariate, peaked-location and last-day CSV files named
    on the command line, then fits either the "data poor" model
    (``ap_model``, used when the location has fewer than ``DATA_THRESHOLD``
    rows) or the "data rich" model (``ap_flat_asym_model``).  The NumPy
    random seed is derived from the location id before fitting.

    Files written to ``--output_dir``:
        * ``data.csv`` -- the modelling columns of the prepared data.
        * ``point_estimate.csv`` -- daily point estimate by date.
        * ``tight_models.pkl`` / ``tight_model_fit_dict.pkl``.
        * ``loose_models.pkl`` / ``loose_model_fit_dict.pkl`` (data poor
          model only) or ``gaussian_mixtures.pkl`` (data rich model only).
        * ``draws.pkl`` -- draws for this location and ``'overall'``.
        * ``model_fits.pdf`` -- one fit plot per modelled location.

    Raises:
        ValueError: If the data poor model is selected and
            ``--covariate_effect`` is neither ``'beta'`` nor ``'gamma'``.
    """
    args = _parse_death_model_args()
    logger.info(args)

    # location ids are prefixed with '_' for explicit str indexing in model
    location_key = f'_{args.model_location_id}'
    model_columns = [
        'location_id', 'intercept', 'Days', 'pseudo',
        'ln(age-standardized death rate)', COVARIATE
    ]

    # read data
    df = pd.read_csv(args.data_file)
    cov_df = pd.read_csv(args.cov_file)

    # only keep if more than one data point is present
    keep_idx = df.groupby('location_id')['location_id'].transform('count') > 1
    df = df[keep_idx].reset_index(drop=True)

    # try setting floor for covariate
    cov_df.loc[cov_df[COVARIATE] < 0.75, COVARIATE] = 0.75

    # attach covs to data file
    df = pd.merge(df, cov_df[['location_id', COVARIATE]], how='left')
    if df[COVARIATE].isnull().any():
        missing_locs = df.loc[df[COVARIATE].isnull(),
                              'Location'].unique().tolist()
        print(
            f'The following locations are missing covariates: {", ".join(missing_locs)}'
        )
        df = df.loc[~df[COVARIATE].isnull()]
    df = df.sort_values(['location_id',
                         'Days']).reset_index(drop=True)  # 'Country/Region',

    # encode location_id for more explicit str indexing in model
    df['location_id'] = '_' + df['location_id'].astype(str)

    # add intercept
    df['intercept'] = 1.0

    # identify covariate value and threshold date for our location
    is_model_location = cov_df['location_id'] == args.model_location_id
    location_cov = cov_df.loc[is_model_location, COVARIATE].item()
    n_b = get_number_of_basis_functions(location_cov)
    # Looked up before fitting so a missing column fails before the
    # (expensive) model run rather than after it.
    d = pd.to_datetime(cov_df.loc[is_model_location, 'threshold_date'].item())

    # get list of peaked locations
    peaked_df = pd.read_csv(args.peaked_file)
    peaked_df['location_id'] = '_' + peaked_df['location_id'].astype(str)
    peaked_groups = peaked_df.loc[peaked_df['location_id'].isin(
        df['location_id'].unique().tolist()), 'location_id'].to_list()
    exclude_groups = peaked_df.loc[
        peaked_df['Location'].str.startswith('Wuhan'),
        'location_id'].unique().tolist()

    # get true ln(dr) on last day
    last_day_df = pd.read_csv(args.last_day_file)
    last_day_df = last_day_df.loc[last_day_df['location_id'] ==
                                  args.model_location_id]
    if last_day_df.empty:
        fix_point = None
        fix_day = None
    else:
        fix_point = last_day_df['ln(death rate)'].item()
        fix_day = last_day_df['Days'].item()

    ## run models
    model_seed = get_hash(location_key)
    np.random.seed(model_seed)
    # AP model for data poor
    if len(df.loc[df['location_id'] == location_key]) < DATA_THRESHOLD:
        logger.info('Running data poor model')
        # are we using a beta or gamma covariate
        if args.covariate_effect == 'beta':
            fix_gamma = True
        elif args.covariate_effect == 'gamma':
            fix_gamma = False
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on ``fix_gamma``.
            raise ValueError(
                "--covariate_effect must be 'beta' or 'gamma', got "
                f'{args.covariate_effect!r}')

        # alpha prior model (no flat top)
        tight_model, loose_model, draws = ap_model(
            df=df[model_columns],
            model_location=location_key,
            location_cov=location_cov,
            n_draws=args.n_draws,
            peaked_groups=peaked_groups,
            exclude_groups=exclude_groups,
            fix_gamma=fix_gamma,
            fix_point=fix_point,
            fix_day=fix_day)
        model = 'AP'

        # get point estimate
        if location_key in draws:
            t = np.arange(PRED_DAYS)
            loose_asdr = loose_model.models[location_key].predict(
                t, group_name=location_key)
            tight_asdr = tight_model.models[location_key].predict(
                t, group_name=location_key)

            # ``fix_day`` is None when the location has no row in the
            # last-day file; ``None + 2`` would raise TypeError.  Fall back
            # to the last fitted time point of the location's model, the
            # same default ``ap_model`` uses for its ``last_day``.
            if fix_day is None:
                blend_anchor = tight_model.models[location_key].t[-1]
            else:
                blend_anchor = fix_day

            # NOTE(review): loose is passed before tight here, whereas the
            # draw blending in ``ap_model`` passes tight first -- confirm
            # that the reversed order is intended.
            ln_asdr = convex_combination(t,
                                         loose_asdr,
                                         tight_asdr,
                                         ln_gaussian_cdf,
                                         start_day=blend_anchor + 2,
                                         end_day=blend_anchor + 25)
            asdr = np.exp(ln_asdr)
        else:
            t = draws['overall'][0]
            asdr = np.exp(draws['overall'][1]).mean(axis=0)
    else:  # AP model for data rich
        logger.info('Running data rich model.')
        tight_model, draws = ap_flat_asym_model(
            df=df[model_columns],
            model_location=location_key,
            n_draws=args.n_draws,
            peaked_groups=peaked_groups,
            exclude_groups=exclude_groups,
            fix_point=fix_point,
            fix_day=fix_day,
            n_b=n_b)
        loose_model = tight_model  # just to plug into plot
        model = 'AP flat asymmetrical'

        # get point estimate
        t = np.arange(PRED_DAYS)
        asdr = np.exp(tight_model.predict(t, ln_gaussian_cdf, location_key))

    # store output as daily (first difference of the cumulative estimate)
    asddr = asdr[1:] - asdr[:-1]
    point_df = pd.DataFrame({
        'location_id':
        args.model_location_id,
        'Date': [d + timedelta(days=int(t_i)) for t_i in t[1:]],
        'Age-standardized death rate':
        asddr
    })

    # only save this location and overall draws
    subset_draws = {
        model_label: draws[model_label]
        for model_label in [location_key, 'overall']
        if model_label in draws
    }

    # store outputs
    # data
    df[model_columns].to_csv(f'{args.output_dir}/data.csv', index=False)
    # point estimate
    point_df.to_csv(f'{args.output_dir}/point_estimate.csv', index=False)
    # loose
    if model == 'AP':
        logger.info('Writing loose models.')
        _dump_pickle(loose_model.models,
                     f'{args.output_dir}/loose_models.pkl')
        _dump_pickle(loose_model.fit_dict,
                     f'{args.output_dir}/loose_model_fit_dict.pkl')
    else:
        # GM data
        logger.info('Writing Gaussian mixture metadata')
        _dump_pickle(tight_model.gaussian_mixtures,
                     f'{args.output_dir}/gaussian_mixtures.pkl')
    # tight
    logger.info('Writing tight models')
    _dump_pickle(tight_model.models, f'{args.output_dir}/tight_models.pkl')
    _dump_pickle(tight_model.fit_dict,
                 f'{args.output_dir}/tight_model_fit_dict.pkl')
    # subset draws
    logger.info('Writing draws')
    _dump_pickle(subset_draws, f'{args.output_dir}/draws.pkl')

    # plot (special condition if using multiple Gaussian)
    model_instance = None if model == 'AP' else tight_model
    logger.info('Writing model fit plots.')
    with PdfPages(f'{args.output_dir}/model_fits.pdf') as pdf:
        for location in tight_model.models.keys():
            location_rows = df['location_id'] == location
            location_name = df.loc[location_rows, 'Location'].values[0]
            plot_location(
                location=location,
                location_name=location_name,
                covariate_val=cov_df.loc[cov_df['Location'] == location_name,
                                         COVARIATE].item(),
                tm=tight_model.models[location],
                lm=loose_model.models[location],
                model_instance=model_instance,
                draw=draws[location],
                population=df.loc[location_rows, 'population'].values[0],
                pdf=pdf,
                n_b=n_b)