Example #1
def do_qsub(session, job_type: str, job_name: str, output_root: Path,
            script_args: List[str], process_key: str):
    error_logs = output_root / 'logs' / job_type / 'error'
    output_logs = output_root / 'logs' / job_type / 'output'
    shell_tools.mkdir(error_logs, exists_ok=True, parents=True)
    shell_tools.mkdir(output_logs, exists_ok=True, parents=True)

    job_template = session.createJobTemplate()
    job_template.remoteCommand = shutil.which('python')
    job_template.outputPath = f':{output_logs}'
    job_template.errorPath = f':{error_logs}'
    job_template.args = script_args
    job_template.nativeSpecification = (
        f'-V '  # Export all environment variables
        f'-b y '  # Command is a binary (python)
        f'-P {PROJECT} '
        f'-q {QUEUE[process_key]} '
        f"-l fmem={RESOURCES[process_key]['fmem']} "
        f"-l fthread={RESOURCES[process_key]['fthread']} "
        f'-l h_rt={H_RUNTIME} '
        f'-N {job_name}')  # Name of the job
    job = session.runJob(job_template)
    logger.info(f'Submitted job {job_name} with id {job}.')
    session.deleteJobTemplate(job_template)
    return job
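A hypothetical call site for do_qsub above, sketched under the assumption that the drmaa package is installed and the module-level PROJECT, QUEUE, RESOURCES, and H_RUNTIME constants are configured; all paths and argument values below are placeholders, not taken from the original code.

import drmaa
from pathlib import Path

# Hypothetical usage sketch for do_qsub (Example #1); values are placeholders.
session = drmaa.Session()
session.initialize()
try:
    job_id = do_qsub(
        session,
        job_type='model',
        job_name='model_location_101',
        output_root=Path('/path/to/outputs'),
        script_args=['run_model.py', '--location-id', '101'],
        process_key='model',
    )
finally:
    # Always tear the DRMAA session down, even if submission fails.
    session.exit()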
Example #2
def test_mkdir_no_parents(mode: int, tmp_path: Path):
    tmp_path.rmdir()
    assert not tmp_path.exists()

    child_dir = tmp_path / 'child'
    with pytest.raises(FileNotFoundError):
        mkdir(child_dir, mode)
Example #3
def do_qsub(session,
            command: str,
            job_type: str,
            job_name: str,
            output_root: Path,
            script_args: List[str]):
    error_logs = output_root / 'logs' / job_type / 'error'
    output_logs = output_root / 'logs' / job_type / 'output'
    shell_tools.mkdir(error_logs, exists_ok=True, parents=True)
    shell_tools.mkdir(output_logs, exists_ok=True, parents=True)

    job_template = session.createJobTemplate()
    job_template.remoteCommand = command
    job_template.outputPath = f':{output_logs}'
    job_template.errorPath = f':{error_logs}'
    job_template.args = script_args
    job_template.nativeSpecification = (f'-V '  # Export all environment variables
                                        f'-b y '  # Command is a binary
                                        f'-P {PROJECT} '
                                        f'-q {QUEUE} '
                                        f'-l fmem={F_MEM} '
                                        f'-l fthread={F_THREAD} '
                                        f'-l h_rt={H_RUNTIME} '
                                        f'-N {job_name}')  # Name of the job
    job = session.runJob(job_template)
    logger.info(f'Submitted job {job_name} with id {job}.')
    session.deleteJobTemplate(job_template)
    return job
Example #4
def configure_logging_to_files(output_path: Path) -> None:
    log_path = output_path / paths.LOG_DIR
    mkdir(log_path, exists_ok=True)
    add_logging_sink(output_path / paths.LOG_DIR /
                     paths.DETAILED_LOG_FILE_NAME,
                     verbose=2,
                     serialize=True)
    add_logging_sink(output_path / paths.LOG_DIR / paths.LOG_FILE_NAME,
                     verbose=1)
Example #5
def test_mkdir_exists(mode: int, parents: bool, tmp_path: Path):
    assert tmp_path.exists()
    perms = oct(tmp_path.stat().st_mode)[-3:]

    with pytest.raises(FileExistsError):
        mkdir(tmp_path, mode, parents=parents)

    mkdir(tmp_path, mode, parents=parents, exists_ok=True)
    assert oct(tmp_path.stat().st_mode)[-3:] == perms
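Examples #2, #5, #8, and #17 all exercise a shell_tools.mkdir wrapper whose signature appears to be mkdir(path, mode=..., exists_ok=False, parents=False). Below is a minimal sketch consistent with those tests, offered only as an assumption about the real implementation; the default mode and exact chmod behavior may differ.

from pathlib import Path


def mkdir(path: Path, mode: int = 0o775,
          exists_ok: bool = False, parents: bool = False) -> None:
    """Create a directory and chmod it so the umask cannot alter the mode.

    Hypothetical sketch inferred from the tests above; not the original code.
    """
    path = Path(path)
    # Record which directories are about to be created so only those get chmod'd.
    if parents:
        missing = [p for p in (path, *path.parents) if not p.exists()]
    else:
        missing = [] if path.exists() else [path]

    path.mkdir(mode=mode, exist_ok=exists_ok, parents=parents)
    for p in missing:
        p.chmod(mode)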
Example #6
def make_log_dirs(log_dir: Union[str, Path]) -> Tuple[str, str]:
    """Create log directories in output root and return the paths."""
    log_dir = Path(log_dir)
    std_out = log_dir / 'output'
    std_err = log_dir / 'error'
    shell_tools.mkdir(std_out, exists_ok=True, parents=True)
    shell_tools.mkdir(std_err, exists_ok=True, parents=True)

    return str(std_out), str(std_err)
Example #7
    def dump(self, data: pandas.DataFrame, key, strict=True):
        path = self.resolve_key(key)
        if not path.parent.is_dir():
            mkdir(path.parent, parents=True)
        elif strict and path.exists():
            msg = f"Cannot dump data for key {key} - would overwrite"
            raise LookupError(msg)

        data.to_csv(path, index=False)
Example #8
def test_mkdir_parents(mode: int, tmp_path: Path):
    tmp_path.rmdir()
    assert not tmp_path.exists()

    child_dir = tmp_path / 'child'
    mkdir(child_dir, mode, parents=True)
    assert tmp_path.exists()
    assert oct(tmp_path.stat().st_mode)[-3:] == oct(mode)[-3:]
    assert child_dir.exists()
    assert oct(child_dir.stat().st_mode)[-3:] == oct(mode)[-3:]
Example #9
    def _setup_checkpoint_dir(output_root: Union[str, Path],
                              clear: bool) -> Path:
        checkpoint_dir = Path(output_root) / 'checkpoint'
        if clear and checkpoint_dir.exists():
            logger.debug('Clearing previous checkpoint data.')
            for p in checkpoint_dir.iterdir():
                p.unlink()
            checkpoint_dir.rmdir()

        logger.debug(f'Making checkpoint directory at {str(checkpoint_dir)}')
        mkdir(checkpoint_dir, exists_ok=True)
        return checkpoint_dir
Example #10
    def make_dirs(self):
        """Builds the local directory structure."""
        if self.read_only:
            raise RuntimeError(
                f"Tried to create directory structure when "
                f"{self.__class__.__name__} was in read_only mode. "
                f"Try instantiating with read_only=False.")

        logger.debug(
            f'Creating sub-directory structure for {self.__class__.__name__} '
            f'in {self.root_dir}.')
        for directory in self.directories:
            mkdir(directory, parents=True, exists_ok=True)
Example #11
def configure_logging_to_files(output_path: Path) -> None:
    """Sets up logging to a file in an output directory.

    File logs are written at the highest verbosity to allow for
    debugging if necessary.

    """
    log_path = output_path / paths.LOG_DIR
    mkdir(log_path, exists_ok=True)
    add_logging_sink(
        output_path / paths.LOG_DIR / paths.DETAILED_LOG_FILE_NAME,
        verbose=3,
        serialize=True,
    )
    add_logging_sink(
        output_path / paths.LOG_DIR / paths.LOG_FILE_NAME,
        verbose=3,
    )
Example #12
def build_directories(output_root: Path):
    logger.info('Creating directories.')
    model_in_dir = output_root / 'model_inputs'
    model_out_dir = output_root / 'model_outputs'
    plot_dir = output_root / 'plots'
    infections_draws_dir = output_root / 'infections_draws'
    shell_tools.mkdir(model_in_dir)
    shell_tools.mkdir(model_out_dir)
    shell_tools.mkdir(plot_dir)
    shell_tools.mkdir(infections_draws_dir)

    return model_in_dir, model_out_dir, plot_dir, infections_draws_dir
Example #13
def setup_directory_structure(output_root: Union[str, Path], with_production: bool = False) -> None:
    """Sets up a best and latest directory for results versioning.

    Parameters
    ----------
    output_root
        The root directory for all outputs.
    with_production
        If true, additionally sets up a `production-run` sub-directory within
        the primary output root.

    """
    mkdir(output_root, exists_ok=True, parents=True)
    output_root = Path(output_root).resolve()
    for link in [paths.BEST_LINK, paths.LATEST_LINK]:
        link_path = output_root / link
        if not link_path.is_symlink() and not link_path.exists():
            mkdir(link_path)

    if with_production:
        production_dir = output_root / paths.PRODUCTION_RUN
        mkdir(production_dir, exists_ok=True)
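For context, a minimal, hypothetical invocation of setup_directory_structure; the output root below is a placeholder and the created entry names come from paths.BEST_LINK, paths.LATEST_LINK, and paths.PRODUCTION_RUN.

from pathlib import Path

# Hypothetical usage; the path is a placeholder.
output_root = Path('/path/to/results/my_model')
setup_directory_structure(output_root, with_production=True)
# output_root now holds stub directories for the best/latest links and a
# production-run sub-directory, ready for later runs to be linked in.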
Example #14
def pipeline_wrapper(
    out_dir: Path,
    excess_mortality: bool,
    gbd: bool,
    vaccine_coverage_root: Path,
    variant_scaleup_root: Path,
    age_rates_root: Path,
    testing_root: Path,
    n_samples: int,
    day_0: pd.Timestamp = pd.Timestamp('2020-03-15'),
    pred_start_date: pd.Timestamp = pd.Timestamp('2019-11-01'),
    pred_end_date: pd.Timestamp = pd.Timestamp('2022-03-15'),
    correlate_samples: bool = True,
    bootstrap: bool = True,
    verbose: bool = True,
) -> Tuple:
    np.random.seed(15243)
    if verbose:
        logger.info('Loading variant, vaccine, and sero data.')
    hierarchy = model_inputs.hierarchy(out_dir)
    if gbd:
        gbd_hierarchy = model_inputs.hierarchy(out_dir, 'covid_gbd')
    else:
        gbd_hierarchy = model_inputs.hierarchy(out_dir,
                                               'covid_modeling_plus_zaf')
    adj_gbd_hierarchy = model_inputs.validate_hierarchies(
        hierarchy.copy(), gbd_hierarchy.copy())
    population = model_inputs.population(out_dir)
    age_spec_population = model_inputs.population(out_dir, by_age=True)
    population_lr, population_hr = age_standardization.get_risk_group_populations(
        age_spec_population)
    shared = {
        'hierarchy': hierarchy,
        'gbd_hierarchy': gbd_hierarchy,
        'adj_gbd_hierarchy': adj_gbd_hierarchy,
        'population': population,
        'age_spec_population': age_spec_population,
        'population_lr': population_lr,
        'population_hr': population_hr,
    }

    escape_variant_prevalence = estimates.variant_scaleup(variant_scaleup_root,
                                                          'escape',
                                                          verbose=verbose)
    severity_variant_prevalence = estimates.variant_scaleup(
        variant_scaleup_root, 'severity', verbose=verbose)
    vaccine_coverage = estimates.vaccine_coverage(vaccine_coverage_root,
                                                  pred_end_date)
    reported_seroprevalence, seroprevalence_samples = serology.load_seroprevalence_sub_vacccinated(
        out_dir,
        hierarchy,
        vaccine_coverage.copy(),
        n_samples=n_samples,
        correlate_samples=correlate_samples,
        bootstrap=bootstrap,
        verbose=verbose,
    )
    reported_sensitivity_data, sensitivity_data_samples = serology.load_sensitivity(
        out_dir,
        n_samples,
    )
    durations_samples = durations.get_duration_dist(n_samples)
    cross_variant_immunity_samples = cvi.get_cvi_dist(n_samples)
    variant_risk_ratio_samples = variant_severity.get_variant_severity_rr_dist(
        n_samples)

    covariate_options = [
        'obesity',
        'smoking',
        'diabetes',
        'ckd',
        'cancer',
        'copd',
        'cvd',
        'uhc',
        'haq',
    ]
    covariates = [
        db.obesity(adj_gbd_hierarchy),
        db.smoking(adj_gbd_hierarchy),
        db.diabetes(adj_gbd_hierarchy),
        db.ckd(adj_gbd_hierarchy),
        db.cancer(adj_gbd_hierarchy),
        db.copd(adj_gbd_hierarchy),
        db.cvd(adj_gbd_hierarchy),
        db.uhc(adj_gbd_hierarchy) / 100,
        db.haq(adj_gbd_hierarchy) / 100,
    ]

    if verbose:
        logger.info(
            'Identifying best covariate combinations and creating input data object.'
        )
    test_combinations = []
    for i in range(len(covariate_options)):
        test_combinations += [
            list(set(cc))
            for cc in itertools.combinations(covariate_options, i + 1)
        ]
    test_combinations = [
        cc for cc in test_combinations
        if len([c for c in cc if c in ['uhc', 'haq']]) <= 1
    ]
    selected_combinations = covariate_selection.covariate_selection(
        n_samples=n_samples,
        test_combinations=test_combinations,
        out_dir=out_dir,
        excess_mortality=excess_mortality,
        age_rates_root=age_rates_root,
        shared=shared,
        reported_seroprevalence=reported_seroprevalence,
        covariate_options=covariate_options,
        covariates=covariates,
        reported_sensitivity_data=reported_sensitivity_data,
        vaccine_coverage=vaccine_coverage,
        escape_variant_prevalence=escape_variant_prevalence,
        severity_variant_prevalence=severity_variant_prevalence,
        cross_variant_immunity_samples=cross_variant_immunity_samples,
        variant_risk_ratio_samples=variant_risk_ratio_samples,
        pred_start_date=pred_start_date,
        pred_end_date=pred_end_date,
        cutoff_pct=1.,
        durations={
            'sero_to_death':
            (int(round(np.mean(durations.EXPOSURE_TO_ADMISSION))) +
             int(round(np.mean(durations.ADMISSION_TO_DEATH))) -
             int(round(np.mean(durations.EXPOSURE_TO_SEROCONVERSION)))),
            'exposure_to_death':
            (int(round(np.mean(durations.EXPOSURE_TO_ADMISSION))) +
             int(round(np.mean(durations.ADMISSION_TO_DEATH)))),
            'exposure_to_seroconversion':
            int(round(np.mean(durations.EXPOSURE_TO_SEROCONVERSION)))
        },
    )

    idr_covariate_options = [
        ['haq'],
        ['uhc'],
        ['prop_65plus'],
        [],
    ]
    idr_covariate_pool = np.random.choice(idr_covariate_options, n_samples)

    day_inflection_options = [
        '2020-06-01',
        '2020-07-01',
        '2020-08-01',
        '2020-09-01',
        '2020-10-01',
        '2020-11-01',
        '2020-12-01',
        '2021-01-01',
        '2021-02-01',
        '2021-03-01',
    ]
    day_inflection_pool = np.random.choice(day_inflection_options, n_samples)
    day_inflection_pool = [pd.Timestamp(str(d)) for d in day_inflection_pool]

    inputs = {
        n: {
            'out_dir': out_dir,
            'orig_seroprevalence': seroprevalence,
            'shared': shared,
            'excess_mortality': excess_mortality,
            'sensitivity_data': sensitivity_data,
            'vaccine_coverage': vaccine_coverage,
            'escape_variant_prevalence': escape_variant_prevalence,
            'severity_variant_prevalence': severity_variant_prevalence,
            'age_rates_root': age_rates_root,
            'testing_root': testing_root,
            'day_inflection': day_inflection,
            'covariates': covariates,
            'covariate_list': covariate_list,
            'idr_covariate_list': idr_covariate_list,
            'cross_variant_immunity': cross_variant_immunity,
            'variant_risk_ratio': variant_risk_ratio,
            'durations': durations,
            'day_0': day_0,
            'pred_start_date': pred_start_date,
            'pred_end_date': pred_end_date,
            'verbose': verbose,
        }
        for n, (
            covariate_list,
            idr_covariate_list,
            seroprevalence,
            sensitivity_data,
            cross_variant_immunity,
            variant_risk_ratio,
            day_inflection,
            durations,
        ) in enumerate(
            zip(
                selected_combinations,
                idr_covariate_pool,
                seroprevalence_samples,
                sensitivity_data_samples,
                cross_variant_immunity_samples,
                variant_risk_ratio_samples,
                day_inflection_pool,
                durations_samples,
            ))
    }

    if verbose:
        logger.info('Storing inputs and submitting sero-sample jobs.')
    inputs_path = out_dir / 'pipeline_inputs.pkl'
    with inputs_path.open('wb') as file:
        pickle.dump(inputs, file, -1)
    pipeline_dir = out_dir / 'pipeline_outputs'
    shell_tools.mkdir(pipeline_dir)
    job_args_map = {
        n: [__file__, n, inputs_path, pipeline_dir]
        for n in range(n_samples)
    }
    if gbd:
        cluster.run_cluster_jobs('covid_rates_pipeline', out_dir, job_args_map,
                                 'gbd')
    else:
        cluster.run_cluster_jobs('covid_rates_pipeline', out_dir, job_args_map,
                                 'standard')

    pipeline_results = {}
    for n in range(n_samples):
        with (pipeline_dir / f'{n}_outputs.pkl').open('rb') as file:
            outputs = pickle.load(file)
        pipeline_results.update(outputs)

    em_data = estimates.excess_mortailty_scalars(excess_mortality)

    return (pipeline_results, selected_combinations,
            cross_variant_immunity_samples, variant_risk_ratio_samples,
            reported_seroprevalence, reported_sensitivity_data,
            escape_variant_prevalence, severity_variant_prevalence,
            vaccine_coverage, em_data, hierarchy, population)
Example #15
def make_run_directory(output_root: Union[str, Path]) -> Path:
    """Convenience function for making a new run directory and getting its path."""
    run_directory = get_run_directory(output_root)
    mkdir(run_directory)
    return run_directory
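Example #15 depends on a get_run_directory helper that is not shown here. Below is a plausible sketch, purely as an assumption, of a datestamped, auto-incrementing run-directory scheme it might implement.

from datetime import datetime
from pathlib import Path
from typing import Union


def get_run_directory(output_root: Union[str, Path]) -> Path:
    """Return a path for a new, uniquely numbered run directory (hypothetical)."""
    output_root = Path(output_root).resolve()
    launch_time = datetime.now().strftime('%Y_%m_%d')
    today_runs = [
        int(run_dir.name.split('.')[1])
        for run_dir in output_root.iterdir()
        if run_dir.is_dir() and run_dir.name.startswith(f'{launch_time}.')
    ]
    run_version = max(today_runs) + 1 if today_runs else 1
    return output_root / f'{launch_time}.{run_version:02}'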
Example #16
def write_seir_inputs(
    model_out_dir: Path,
    infections_draws_dir: Path,
    output_root: Path,
    hierarchy: pd.DataFrame,
    infections_draws: pd.DataFrame,
    em_scalar_data: pd.Series,
    deaths: pd.Series,
    estimated_ratios: Dict,
    variant_risk_ratio: List,
    sero_data: pd.DataFrame,
    sub_infections_draws: pd.DataFrame,
    n_draws: int,
    durations: List,
    reported_deaths: pd.Series,
):
    most_detailed = hierarchy['most_detailed'] == 1
    location_ids = hierarchy.loc[most_detailed, 'location_id'].to_list()

    logger.info('Writing SEIR inputs - infections draw files.')
    infections_draws_cols = infections_draws.columns
    infections_draws = pd.concat([infections_draws, deaths], axis=1)
    infections_draws = infections_draws.sort_index()
    deaths = infections_draws['deaths'].copy()
    infections_draws = [
        infections_draws[infections_draws_col].copy()
        for infections_draws_col in infections_draws_cols
    ]
    infections_draws = [
        pd.concat([
            infections_draw, (deaths * em_scalar_data.loc[n]).rename('deaths')
        ],
                  axis=1) for n, infections_draw in enumerate(infections_draws)
    ]

    if not sub_infections_draws.empty:
        _, _, ifr = estimated_ratios['deaths']
        _get_sub_loc_deaths = functools.partial(
            aggregation.get_sub_loc_deaths,
            n_draws=n_draws,
            sub_infections_draws=sub_infections_draws.copy(),
            ifr=ifr.copy(),
            durations=durations.copy(),
            reported_deaths=reported_deaths.copy(),
        )
        with multiprocessing.Pool(MP_THREADS) as p:
            sub_deaths_scalar = list(
                tqdm(p.imap(_get_sub_loc_deaths, SUB_LOCATIONS),
                     total=len(SUB_LOCATIONS),
                     file=sys.stdout))
        sub_deaths = pd.concat(
            [sds[0] for sds in sub_deaths_scalar]).sort_index()
        sub_em_scalar_data = pd.concat(
            [sds[1] for sds in sub_deaths_scalar]).sort_index()
        del sub_deaths_scalar
        sub_infections_draws = [
            pd.concat([
                sub_infections_draws[infections_draws_col].copy(),
                sub_deaths[infections_draws_col].rename('deaths')
            ],
                      axis=1) for infections_draws_col in infections_draws_cols
        ]
        infections_draws = [
            pd.concat([i_d, s_i_d]).sort_index()
            for i_d, s_i_d in zip(infections_draws, sub_infections_draws)
        ]
        em_scalar_data = pd.concat([em_scalar_data,
                                    sub_em_scalar_data]).sort_index()

    _inf_writer = functools.partial(
        data.write_infections_draws,
        infections_draws_dir=infections_draws_dir,
    )
    with multiprocessing.Pool(MP_THREADS) as p:
        infections_draws_paths = list(
            tqdm(p.imap(_inf_writer, infections_draws),
                 total=n_draws,
                 file=sys.stdout))

    for measure, (estimated_ratio, measure_durations,
                  ratio_prior_estimates) in estimated_ratios.items():
        logger.info(f'Compiling {estimated_ratio.upper()} draws.')
        ratio_draws = []
        for draws_path in [
                result_path for result_path in model_out_dir.iterdir() if str(
                    result_path).endswith(f'_{estimated_ratio}_draws.parquet')
        ]:
            ratio_draws.append(pd.read_parquet(draws_path))
        ratio_draws = pd.concat(ratio_draws)
        if estimated_ratio == 'ifr':
            ratio_rr_draws = []
            for draws_path in [
                    result_path for result_path in model_out_dir.iterdir()
                    if str(result_path).endswith(
                        f'_{estimated_ratio}_rr_draws.parquet')
            ]:
                ratio_rr_draws.append(pd.read_parquet(draws_path))
            ratio_rr_draws = pd.concat(ratio_rr_draws)

        logger.info(
            f'Filling {estimated_ratio.upper()} with original model estimate where we do not have a posterior.'
        )
        ratio_draws_cols = ratio_draws.columns
        ratio_prior_estimates = (
            ratio_prior_estimates
            .reset_index()
            .loc[:, ['location_id', 'draw', 'date', 'ratio']])
        ratio_prior_estimates = pd.pivot_table(
            ratio_prior_estimates,
            index=['location_id', 'date'],
            columns='draw',
            values='ratio',
        )
        ratio_prior_estimates.columns = ratio_draws_cols
        ratio_locations = ratio_draws.reset_index()['location_id'].unique()
        ratio_prior_locations = (
            ratio_prior_estimates.reset_index()['location_id'].unique())
        missing_locations = [
            l for l in ratio_prior_locations if l not in ratio_locations
        ]
        missing_locations = [l for l in missing_locations if l in location_ids]
        ratio_prior_estimates = ratio_prior_estimates.loc[missing_locations]
        if len(ratio_prior_estimates) > 0:
            logger.info(
                f"Appending prior estimates for the following locations: "
                f"{', '.join(ratio_prior_estimates.reset_index()['location_id'].astype(str).unique())}"
            )
        ratio_draws = ratio_draws.append(ratio_prior_estimates)

        logger.info(
            f'Writing SEIR inputs - {estimated_ratio.upper()} draw files.')
        ratio_draws = ratio_draws.sort_index()
        if estimated_ratio == 'ifr':
            ifr_lr_rr = (ratio_rr_draws.reset_index()
                         .loc[:, ['location_id', 'draw', 'date', 'ifr_lr_rr']])
            ifr_lr_rr = pd.pivot_table(
                ifr_lr_rr,
                index=['location_id', 'date'],
                columns='draw',
                values='ifr_lr_rr',
            )
            ifr_lr_rr.columns = ratio_draws_cols
            ifr_hr_rr = (ratio_rr_draws.reset_index()
                         .loc[:, ['location_id', 'draw', 'date', 'ifr_hr_rr']])
            ifr_hr_rr = pd.pivot_table(
                ifr_hr_rr,
                index=['location_id', 'date'],
                columns='draw',
                values='ifr_hr_rr',
            )
            ifr_hr_rr.columns = ratio_draws_cols
            ratio_draws = [(
                ratio_draws[ratio_draws_col].copy(),
                ifr_lr_rr[ratio_draws_col].copy(),
                ifr_hr_rr[ratio_draws_col].copy(),
            ) for ratio_draws_col in ratio_draws_cols]
        else:
            ratio_draws = [[ratio_draws[ratio_draws_col].copy()]
                           for ratio_draws_col in ratio_draws_cols]
        ratio_draws_dir = output_root / f'{estimated_ratio}_draws'
        shell_tools.mkdir(ratio_draws_dir)
        _ratio_writer = functools.partial(
            data.write_ratio_draws,
            estimated_ratio=estimated_ratio,
            durations=measure_durations,
            variant_risk_ratio=variant_risk_ratio,
            ratio_draws_dir=ratio_draws_dir,
        )
        with multiprocessing.Pool(MP_THREADS) as p:
            ratio_draws_paths = list(
                tqdm(p.imap(_ratio_writer, ratio_draws),
                     total=n_draws,
                     file=sys.stdout))

    logger.info('Writing serology data and EM scaling factor data.')
    em_path = output_root / 'em_data.csv'
    em_scalar_data = (
        infections_draws[0]
        .reset_index()
        .loc[:, ['location_id', 'date']]
        .merge(em_scalar_data.reset_index(), how='left'))
    em_scalar_data['em_scalar'] = em_scalar_data['em_scalar'].fillna(1)
    em_scalar_data.to_csv(em_path, index=False)
    # em_scalar_data['date'] = em_scalar_data['date'].astype(str)
    # em_path = output_root / 'em_data.parquet'
    # em_scalar_data.to_parquet(em_path, engine='fastparquet', compression='gzip')
    sero_data['included'] = 1 - sero_data['is_outlier']
    sero_data = sero_data.rename(columns={'sero_sample_mean': 'value'})
    sero_data = sero_data.loc[:, ['included', 'value']]
    sero_path = output_root / 'sero_data.csv'
    sero_data.reset_index().to_csv(sero_path, index=False)
Example #17
def test_mkdir_no_args(mode: int, tmp_path: Path):
    tmp_path.rmdir()
    assert not tmp_path.exists()
    mkdir(tmp_path, mode=mode)
    assert tmp_path.exists()
    assert oct(tmp_path.stat().st_mode)[-3:] == oct(mode)[-3:]
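The mode and parents arguments in Examples #2, #5, #8, and #17 look like pytest fixtures (tmp_path is pytest's built-in). A hypothetical conftest-style setup that would drive those tests; the particular permission modes are assumptions.

import pytest


@pytest.fixture(params=[0o700, 0o755, 0o775])
def mode(request) -> int:
    # Hypothetical set of permission modes to exercise; the real suite may differ.
    return request.param


@pytest.fixture(params=[False, True])
def parents(request) -> bool:
    # Run the directory-exists tests both with and without parent creation.
    return request.param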
Example #18
def make_deaths(app_metadata: cli_tools.Metadata, input_root: Path,
                output_root: Path, holdout_days: int, dow_holdouts: int,
                n_draws: int):
    logger.debug(f"Setting up output directories in {str(output_root)}.")
    model_dir = output_root / 'models'
    spline_settings_dir = output_root / 'spline_settings'
    plot_dir = output_root / 'plots'
    shell_tools.mkdir(model_dir)
    shell_tools.mkdir(spline_settings_dir)
    shell_tools.mkdir(plot_dir)

    logger.debug("Loading and cleaning data.")
    hierarchy = data.load_most_detailed_locations(input_root)
    agg_hierarchy = data.load_aggregate_locations(input_root)

    full_data = data.load_full_data(input_root)
    full_data, manipulation_metadata = data.evil_doings(full_data)
    app_metadata.update({'data_manipulation': manipulation_metadata})

    death_data = data.get_death_data(full_data)
    max_death_date = (death_data.groupby('location_id')['Date'].max().rename(
        'max_death_date').reset_index())
    case_data = data.get_shifted_data(full_data, 'Confirmed',
                                      'Confirmed case rate')
    case_data = case_data.merge(max_death_date)
    case_data = case_data.loc[
        case_data['True date'] <= case_data['max_death_date']]
    del case_data['max_death_date']
    hosp_data = data.get_shifted_data(full_data, 'Hospitalizations',
                                      'Hospitalization rate')
    hosp_data = hosp_data.merge(max_death_date)
    hosp_data = hosp_data.loc[
        hosp_data['True date'] <= hosp_data['max_death_date']]
    del hosp_data['max_death_date']
    del max_death_date
    pop_data = data.get_population_data(input_root, hierarchy)

    logger.debug(f"Dropping {holdout_days} days from the end of the data.")
    case_data = data.holdout_days(case_data, holdout_days)
    hosp_data = data.holdout_days(hosp_data, holdout_days)
    death_data = data.holdout_days(death_data, holdout_days)

    logger.debug("Filtering data by location.")
    case_data, missing_cases = data.filter_data_by_location(
        case_data, hierarchy, 'cases')
    hosp_data, missing_hosp = data.filter_data_by_location(
        hosp_data, hierarchy, 'hospitalizations')
    death_data, missing_deaths = data.filter_data_by_location(
        death_data, hierarchy, 'deaths')
    pop_data, missing_pop = data.filter_data_by_location(
        pop_data, hierarchy, 'population')

    logger.debug("Combine datasets.")
    model_data = data.combine_data(case_data, hosp_data, death_data, pop_data,
                                   hierarchy)
    model_data = model_data.sort_values(['location_id',
                                         'Date']).reset_index(drop=True)
    model_data = data.drop_leading_zeros(
        model_data,
        ['Death rate'])  # , 'Confirmed case rate', 'Hospitalization rate'

    logger.debug("Create aggregates for modeling.")
    agg_locations = [
        aggregate.Location(lid, lname) for lid, lname in zip(
            agg_hierarchy['location_id'], agg_hierarchy['location_name'])
    ]
    agg_model_data = aggregate.compute_location_aggregates_data(
        model_data, hierarchy, agg_locations,
        ['Confirmed case rate', 'Hospitalization rate', 'Death rate'])
    model_data = model_data.append(agg_model_data)
    model_data = model_data.sort_values(['location_id',
                                         'Date']).reset_index(drop=True)

    logger.debug("Filter cases/hospitalizations based on threshold.")
    model_data, dropped_locations, no_cases_locs, no_hosp_locs = data.filter_to_epi_threshold(
        hierarchy, model_data, death_threshold=5, epi_threshold=10)
    app_metadata.update({'dropped_locations': dropped_locations})

    logger.debug("Preparing model settings.")
    model_settings = {}
    s1_settings = {
        'dep_var': 'Death rate',
        'model_dir': str(model_dir),
        'indep_vars': []
    }
    cfr_settings = {'spline_var': 'Confirmed case rate', 'model_type': 'CFR'}
    cfr_settings.update(s1_settings)
    model_settings.update({'CFR': cfr_settings})
    hfr_settings = {'spline_var': 'Hospitalization rate', 'model_type': 'HFR'}
    hfr_settings.update(s1_settings)
    model_settings.update({'HFR': hfr_settings})
    smoother_settings = {
        'obs_var': 'Death rate',
        'pred_vars':
        ['Predicted death rate (CFR)', 'Predicted death rate (HFR)'],
        'spline_vars': ['Confirmed case rate', 'Hospitalization rate'],
        'spline_settings_dir': str(spline_settings_dir)
    }
    model_settings.update({'smoother': smoother_settings})
    model_settings['no_cases_locs'] = no_cases_locs
    model_settings['no_hosp_locs'] = no_hosp_locs

    logger.debug("Launching models by location.")
    working_dir = output_root / 'model_working_dir'
    shell_tools.mkdir(working_dir)
    data_path = Path(working_dir) / 'model_data.pkl'
    with data_path.open('wb') as data_file:
        pickle.dump(model_data, data_file, -1)
    results_path = Path(working_dir) / 'model_outputs'
    shell_tools.mkdir(results_path)
    model_settings['results_dir'] = str(results_path)
    settings_path = Path(working_dir) / 'settings.yaml'
    with settings_path.open('w') as settings_file:
        yaml.dump(model_settings, settings_file)
    job_args_map = {
        location_id: [
            models.__file__, location_id, data_path, settings_path,
            dow_holdouts,
            str(plot_dir), n_draws, cluster.OMP_NUM_THREADS
        ]
        for location_id in model_data['location_id'].unique()
        if location_id not in PARENT_MODEL_LOCATIONS
    }
    cluster.run_cluster_jobs('covid_death_models', output_root, job_args_map)

    logger.debug("Compiling results.")
    results = []
    for result_path in results_path.iterdir():
        with result_path.open('rb') as result_file:
            results.append(pickle.load(result_file))
    post_model_data = pd.concat([r.model_data
                                 for r in results]).reset_index(drop=True)
    noisy_draws = pd.concat([r.noisy_draws
                             for r in results]).reset_index(drop=True)
    smooth_draws = pd.concat([r.smooth_draws
                              for r in results]).reset_index(drop=True)
    failed_model_locations = (
        model_data.loc[~model_data['location_id'].
                       isin(post_model_data['location_id'].to_list()),
                       'location_id'].unique().tolist())
    failed_model_locations = [
        l for l in failed_model_locations if l not in PARENT_MODEL_LOCATIONS
    ]
    failed_model_locations = [
        l for l in failed_model_locations
        if l in hierarchy['location_id'].to_list()
    ]
    app_metadata.update({'failed_model_locations': failed_model_locations})
    model_data = post_model_data.append(
        model_data.loc[model_data['location_id'].isin(PARENT_MODEL_LOCATIONS)])
    obs_var = smoother_settings['obs_var']
    spline_vars = smoother_settings['spline_vars']

    logger.debug("Capturing location-dates with NaNs and dropping them.")
    nan_rows = smooth_draws.isnull().any(axis=1)
    smooth_draws_nans = smooth_draws.loc[nan_rows].reset_index(drop=True)
    smooth_draws = smooth_draws.loc[~nan_rows].reset_index(drop=True)
    nan_min = smooth_draws_nans.groupby('location_id')['date'].min()
    val_max = smooth_draws.groupby('location_id')['date'].max()
    date_diffs = (nan_min - val_max).apply(lambda x: x.days)
    date_diffs = date_diffs.loc[date_diffs.notnull()]
    app_metadata.update({'nan_locations': date_diffs.index.to_list()})
    if (date_diffs < 0).any():
        date_diffs.to_csv(output_root / 'problem_location_report.csv',
                          index=False)
        raise ValueError(
            'Dropping NaNs in middle of time series (see problem_location_report.csv)'
        )

    logger.debug("Fill specified model locations with parent and plot them.")
    smooth_draws, model_data = data.apply_parents(PARENT_MODEL_LOCATIONS,
                                                  hierarchy, smooth_draws,
                                                  model_data, pop_data)
    summarize.summarize_and_plot(
        smooth_draws.loc[smooth_draws['location_id'].isin(
            PARENT_MODEL_LOCATIONS)].rename(columns={'date': 'Date'}),
        model_data.loc[model_data['location_id'].isin(PARENT_MODEL_LOCATIONS)],
        str(plot_dir),
        obs_var=obs_var,
        spline_vars=spline_vars,
        pop_data=pop_data)
    app_metadata.update({'parent_model_locations': PARENT_MODEL_LOCATIONS})

    logger.debug("Make post-model aggregates and plot them.")
    agg_locations = [aggregate.Location(1, 'Global')] + agg_locations
    agg_model_data = aggregate.compute_location_aggregates_data(
        model_data, hierarchy, agg_locations)
    agg_model_data['location_id'] = -agg_model_data['location_id']
    agg_model_data['location_name'] = agg_model_data[
        'location_name'] + ' (model aggregate)'
    agg_draw_df = aggregate.compute_location_aggregates_draws(
        smooth_draws.rename(columns={'date': 'Date'}), hierarchy,
        agg_locations)
    agg_draw_df['location_id'] = -agg_draw_df['location_id']
    summarize.summarize_and_plot(agg_draw_df,
                                 agg_model_data,
                                 str(plot_dir),
                                 obs_var=obs_var,
                                 spline_vars=spline_vars)

    logger.debug("Compiling plots.")
    plot_hierarchy = aggregate.get_sorted_hierarchy_w_aggs(
        hierarchy, agg_hierarchy)
    possible_pdfs = ['-1.pdf'] + [f'{l}.pdf'
                                  for l in plot_hierarchy.location_id]
    existing_pdfs = [
        str(x).split('/')[-1] for x in plot_dir.iterdir() if x.is_file()
    ]
    pdfs = [
        f'{plot_dir}/{pdf}' for pdf in possible_pdfs if pdf in existing_pdfs
    ]
    pdf_merger.pdf_merger(pdfs=pdfs,
                          outfile=str(output_root / 'model_results.pdf'))

    logger.debug(f"Writing output data in {str(output_root)}.")
    model_data = model_data.rename(columns={
        'Date': 'date'
    }).set_index(['location_id', 'date'])
    noisy_draws = noisy_draws.set_index(['location_id', 'date'])
    noisy_draws['observed'] = model_data['Death rate'].notnull().astype(int)
    smooth_draws = smooth_draws.set_index(['location_id', 'date'])
    smooth_draws['observed'] = model_data['Death rate'].notnull().astype(int)
    model_data.rename(columns={
        'date': 'Date'
    }).reset_index().to_csv(output_root / 'model_data.csv', index=False)
    noisy_draws.reset_index().to_csv(output_root / 'model_results.csv',
                                     index=False)
    smooth_draws.reset_index().to_csv(output_root / 'model_results_refit.csv',
                                      index=False)
    smooth_draws_nans.to_csv(output_root / 'model_results_refit_nans.csv',
                             index=False)