def do_qsub(session, job_type: str, job_name: str, output_root: Path,
            script_args: List[str], process_key: str):
    error_logs = output_root / 'logs' / job_type / 'error'
    output_logs = output_root / 'logs' / job_type / 'output'
    shell_tools.mkdir(error_logs, exists_ok=True, parents=True)
    shell_tools.mkdir(output_logs, exists_ok=True, parents=True)
    job_template = session.createJobTemplate()
    job_template.remoteCommand = shutil.which('python')
    job_template.outputPath = f':{output_logs}'
    job_template.errorPath = f':{error_logs}'
    job_template.args = script_args
    job_template.nativeSpecification = (
        f'-V '    # Export all environment variables
        f'-b y '  # Command is a binary (python)
        f'-P {PROJECT} '
        f'-q {QUEUE[process_key]} '
        f"-l fmem={RESOURCES[process_key]['fmem']} "
        f"-l fthread={RESOURCES[process_key]['fthread']} "
        f'-l h_rt={H_RUNTIME} '
        f'-N {job_name}')  # Name of the job
    job = session.runJob(job_template)
    logger.info(f'Submitted job {job_name} with id {job}.')
    session.deleteJobTemplate(job_template)
    return job
def test_mkdir_no_parents(mode: int, tmp_path: Path):
    tmp_path.rmdir()
    assert not tmp_path.exists()
    child_dir = tmp_path / 'child'
    with pytest.raises(FileNotFoundError):
        mkdir(child_dir, mode)
def do_qsub(session, command: str, job_type: str, job_name: str,
            output_root: Path, script_args: List[str]):
    error_logs = output_root / 'logs' / job_type / 'error'
    output_logs = output_root / 'logs' / job_type / 'output'
    shell_tools.mkdir(error_logs, exists_ok=True, parents=True)
    shell_tools.mkdir(output_logs, exists_ok=True, parents=True)
    job_template = session.createJobTemplate()
    job_template.remoteCommand = command
    job_template.outputPath = f':{output_logs}'
    job_template.errorPath = f':{error_logs}'
    job_template.args = script_args
    job_template.nativeSpecification = (
        f'-V '    # Export all environment variables
        f'-b y '  # Command is a binary
        f'-P {PROJECT} '
        f'-q {QUEUE} '
        f'-l fmem={F_MEM} '
        f'-l fthread={F_THREAD} '
        f'-l h_rt={H_RUNTIME} '
        f'-N {job_name}')  # Name of the job
    job = session.runJob(job_template)
    logger.info(f'Submitted job {job_name} with id {job}.')
    session.deleteJobTemplate(job_template)
    return job
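# Usage sketch (an assumption, not part of the source): how do_qsub might be
# driven with the python `drmaa` bindings.  Assumes a DRMAA-capable grid
# engine and that PROJECT, QUEUE, F_MEM, F_THREAD, and H_RUNTIME are set
# module constants.
import shutil
from pathlib import Path

import drmaa


def submit_example(output_root: Path) -> None:
    session = drmaa.Session()
    session.initialize()
    try:
        job_id = do_qsub(
            session,
            command=shutil.which('python'),  # run workers under this interpreter
            job_type='example',
            job_name='example_job',
            output_root=output_root,
            script_args=['worker.py', '--draw', '0'],  # hypothetical worker args
        )
        # Block until the job leaves the queue; drop this line to fire-and-forget.
        session.wait(job_id, drmaa.Session.TIMEOUT_WAIT_FOREVER)
    finally:
        session.exit()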
def configure_logging_to_files(output_path: Path) -> None:
    log_path = output_path / paths.LOG_DIR
    mkdir(log_path, exists_ok=True)
    add_logging_sink(log_path / paths.DETAILED_LOG_FILE_NAME, verbose=2, serialize=True)
    add_logging_sink(log_path / paths.LOG_FILE_NAME, verbose=1)
def test_mkdir_exists(mode: int, parents: bool, tmp_path: Path):
    assert tmp_path.exists()
    perms = oct(tmp_path.stat().st_mode)[-3:]
    with pytest.raises(FileExistsError):
        mkdir(tmp_path, mode, parents=parents)
    mkdir(tmp_path, mode, parents=parents, exists_ok=True)
    assert oct(tmp_path.stat().st_mode)[-3:] == perms
def make_log_dirs(log_dir: Union[str, Path]) -> Tuple[str, str]:
    """Create log directories in output root and return the paths."""
    log_dir = Path(log_dir)
    std_out = log_dir / 'output'
    std_err = log_dir / 'error'
    shell_tools.mkdir(std_out, exists_ok=True, parents=True)
    shell_tools.mkdir(std_err, exists_ok=True, parents=True)
    return str(std_out), str(std_err)
def dump(self, data: pandas.DataFrame, key, strict=True):
    path = self.resolve_key(key)
    if not path.parent.is_dir():
        mkdir(path.parent, parents=True)
    elif strict and path.exists():
        msg = f"Cannot dump data for key {key} - would overwrite"
        raise LookupError(msg)
    data.to_csv(path, index=False)
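# Context sketch (an assumption, not from the source): a minimal store class
# in which a `dump` method like the one above could live.  `resolve_key` is
# shown as a hypothetical key -> CSV path mapping; the real class may differ.
from pathlib import Path


class CsvStore:
    def __init__(self, root: Path):
        self.root = Path(root)

    def resolve_key(self, key: str) -> Path:
        # Hypothetical layout: one CSV file per key under the store root.
        return self.root / f'{key}.csv'

    dump = dump  # attach the module-level method defined above

# store = CsvStore(Path('/tmp/store'))
# store.dump(df, 'cases')                 # writes /tmp/store/cases.csv
# store.dump(df, 'cases')                 # raises LookupError (strict guard)
# store.dump(df, 'cases', strict=False)   # overwrites silently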
def test_mkdir_parents(mode: int, tmp_path: Path):
    tmp_path.rmdir()
    assert not tmp_path.exists()
    child_dir = tmp_path / 'child'
    mkdir(child_dir, mode, parents=True)
    assert tmp_path.exists()
    assert oct(tmp_path.stat().st_mode)[-3:] == oct(mode)[-3:]
    assert child_dir.exists()
    assert oct(child_dir.stat().st_mode)[-3:] == oct(mode)[-3:]
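# The tests here pin down the contract of `mkdir`, but the helper itself is
# not shown.  A minimal sketch that satisfies them (an assumption, not
# necessarily the real implementation): temporarily set the process umask so
# that the leaf directory and any created parents all come out with `mode`.
import os
from pathlib import Path
from typing import Union


def mkdir(path: Union[str, Path], mode: int = 0o775,
          exists_ok: bool = False, parents: bool = False) -> None:
    path = Path(path)
    old_umask = os.umask(0o777 - mode)  # new dirs get 0o777 & ~umask == mode
    try:
        path.mkdir(exist_ok=exists_ok, parents=parents)
    finally:
        os.umask(old_umask)  # always restore the caller's umask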
def _setup_checkpoint_dir(output_root: Union[str, Path], clear: bool) -> Path:
    checkpoint_dir = Path(output_root) / 'checkpoint'
    if clear and checkpoint_dir.exists():
        logger.debug('Clearing previous checkpoint data.')
        for p in checkpoint_dir.iterdir():
            p.unlink()
        checkpoint_dir.rmdir()
    logger.debug(f'Making checkpoint directory at {str(checkpoint_dir)}')
    mkdir(checkpoint_dir, exists_ok=True)
    return checkpoint_dir
def make_dirs(self):
    """Builds the local directory structure."""
    if self.read_only:
        raise RuntimeError(
            f"Tried to create directory structure when "
            f"{self.__class__.__name__} was in read_only mode. "
            f"Try instantiating with read_only=False.")
    logger.debug(f'Creating sub-directory structure for '
                 f'{self.__class__.__name__} in {self.root_dir}.')
    for directory in self.directories:
        mkdir(directory, parents=True, exists_ok=True)
def configure_logging_to_files(output_path: Path) -> None:
    """Sets up logging to a file in an output directory.

    Logs to files are done with the highest verbosity to allow
    for debugging if necessary.

    """
    log_path = output_path / paths.LOG_DIR
    mkdir(log_path, exists_ok=True)
    add_logging_sink(log_path / paths.DETAILED_LOG_FILE_NAME, verbose=3, serialize=True)
    add_logging_sink(log_path / paths.LOG_FILE_NAME, verbose=3)
def build_directories(output_root: Path):
    logger.info('Creating directories.')
    model_in_dir = output_root / 'model_inputs'
    model_out_dir = output_root / 'model_outputs'
    plot_dir = output_root / 'plots'
    infections_draws_dir = output_root / 'infections_draws'
    shell_tools.mkdir(model_in_dir)
    shell_tools.mkdir(model_out_dir)
    shell_tools.mkdir(plot_dir)
    shell_tools.mkdir(infections_draws_dir)
    return model_in_dir, model_out_dir, plot_dir, infections_draws_dir
def setup_directory_structure(output_root: Union[str, Path],
                              with_production: bool = False) -> None:
    """Sets up a best and latest directory for results versioning.

    Parameters
    ----------
    output_root
        The root directory for all outputs.
    with_production
        If true, additionally sets up a `production-run` sub-directory
        within the primary output root.

    """
    mkdir(output_root, exists_ok=True, parents=True)
    output_root = Path(output_root).resolve()
    for link in [paths.BEST_LINK, paths.LATEST_LINK]:
        link_path = output_root / link
        if not link_path.is_symlink() and not link_path.exists():
            mkdir(link_path)
    if with_production:
        production_dir = output_root / paths.PRODUCTION_RUN
        mkdir(production_dir, exists_ok=True)
def pipeline_wrapper(
        out_dir: Path,
        excess_mortality: bool,
        gbd: bool,
        vaccine_coverage_root: Path,
        variant_scaleup_root: Path,
        age_rates_root: Path,
        testing_root: Path,
        n_samples: int,
        day_0: pd.Timestamp = pd.Timestamp('2020-03-15'),
        pred_start_date: pd.Timestamp = pd.Timestamp('2019-11-01'),
        pred_end_date: pd.Timestamp = pd.Timestamp('2022-03-15'),
        correlate_samples: bool = True,
        bootstrap: bool = True,
        verbose: bool = True,
) -> Tuple:
    np.random.seed(15243)
    if verbose:
        logger.info('Loading variant, vaccine, and sero data.')
    hierarchy = model_inputs.hierarchy(out_dir)
    if gbd:
        gbd_hierarchy = model_inputs.hierarchy(out_dir, 'covid_gbd')
    else:
        gbd_hierarchy = model_inputs.hierarchy(out_dir, 'covid_modeling_plus_zaf')
    adj_gbd_hierarchy = model_inputs.validate_hierarchies(hierarchy.copy(),
                                                          gbd_hierarchy.copy())
    population = model_inputs.population(out_dir)
    age_spec_population = model_inputs.population(out_dir, by_age=True)
    population_lr, population_hr = age_standardization.get_risk_group_populations(age_spec_population)
    shared = {
        'hierarchy': hierarchy,
        'gbd_hierarchy': gbd_hierarchy,
        'adj_gbd_hierarchy': adj_gbd_hierarchy,
        'population': population,
        'age_spec_population': age_spec_population,
        'population_lr': population_lr,
        'population_hr': population_hr,
    }
    escape_variant_prevalence = estimates.variant_scaleup(variant_scaleup_root, 'escape', verbose=verbose)
    severity_variant_prevalence = estimates.variant_scaleup(variant_scaleup_root, 'severity', verbose=verbose)
    vaccine_coverage = estimates.vaccine_coverage(vaccine_coverage_root, pred_end_date)
    reported_seroprevalence, seroprevalence_samples = serology.load_seroprevalence_sub_vacccinated(
        out_dir, hierarchy, vaccine_coverage.copy(),
        n_samples=n_samples,
        correlate_samples=correlate_samples,
        bootstrap=bootstrap,
        verbose=verbose,
    )
    reported_sensitivity_data, sensitivity_data_samples = serology.load_sensitivity(out_dir, n_samples)
    durations_samples = durations.get_duration_dist(n_samples)
    cross_variant_immunity_samples = cvi.get_cvi_dist(n_samples)
    variant_risk_ratio_samples = variant_severity.get_variant_severity_rr_dist(n_samples)

    covariate_options = ['obesity', 'smoking', 'diabetes', 'ckd', 'cancer',
                         'copd', 'cvd', 'uhc', 'haq']
    covariates = [
        db.obesity(adj_gbd_hierarchy),
        db.smoking(adj_gbd_hierarchy),
        db.diabetes(adj_gbd_hierarchy),
        db.ckd(adj_gbd_hierarchy),
        db.cancer(adj_gbd_hierarchy),
        db.copd(adj_gbd_hierarchy),
        db.cvd(adj_gbd_hierarchy),
        db.uhc(adj_gbd_hierarchy) / 100,
        db.haq(adj_gbd_hierarchy) / 100,
    ]

    if verbose:
        logger.info('Identifying best covariate combinations and creating input data object.')
    test_combinations = []
    for i in range(len(covariate_options)):
        test_combinations += [list(set(cc)) for cc in
                              itertools.combinations(covariate_options, i + 1)]
    test_combinations = [
        cc for cc in test_combinations
        if len([c for c in cc if c in ['uhc', 'haq']]) <= 1
    ]
    selected_combinations = covariate_selection.covariate_selection(
        n_samples=n_samples,
        test_combinations=test_combinations,
        out_dir=out_dir,
        excess_mortality=excess_mortality,
        age_rates_root=age_rates_root,
        shared=shared,
        reported_seroprevalence=reported_seroprevalence,
        covariate_options=covariate_options,
        covariates=covariates,
        reported_sensitivity_data=reported_sensitivity_data,
        vaccine_coverage=vaccine_coverage,
        escape_variant_prevalence=escape_variant_prevalence,
        severity_variant_prevalence=severity_variant_prevalence,
        cross_variant_immunity_samples=cross_variant_immunity_samples,
        variant_risk_ratio_samples=variant_risk_ratio_samples,
        pred_start_date=pred_start_date,
        pred_end_date=pred_end_date,
        cutoff_pct=1.,
        durations={
            'sero_to_death': (int(round(np.mean(durations.EXPOSURE_TO_ADMISSION)))
                              + int(round(np.mean(durations.ADMISSION_TO_DEATH)))
                              - int(round(np.mean(durations.EXPOSURE_TO_SEROCONVERSION)))),
            'exposure_to_death': (int(round(np.mean(durations.EXPOSURE_TO_ADMISSION)))
                                  + int(round(np.mean(durations.ADMISSION_TO_DEATH)))),
            'exposure_to_seroconversion': int(round(np.mean(durations.EXPOSURE_TO_SEROCONVERSION))),
        },
    )

    idr_covariate_options = [['haq'], ['uhc'], ['prop_65plus'], []]
    idr_covariate_pool = np.random.choice(idr_covariate_options, n_samples)
    day_inflection_options = [
        '2020-06-01', '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01',
        '2020-11-01', '2020-12-01', '2021-01-01', '2021-02-01', '2021-03-01',
    ]
    day_inflection_pool = np.random.choice(day_inflection_options, n_samples)
    day_inflection_pool = [pd.Timestamp(str(d)) for d in day_inflection_pool]

    inputs = {
        n: {
            'out_dir': out_dir,
            'orig_seroprevalence': seroprevalence,
            'shared': shared,
            'excess_mortality': excess_mortality,
            'sensitivity_data': sensitivity_data,
            'vaccine_coverage': vaccine_coverage,
            'escape_variant_prevalence': escape_variant_prevalence,
            'severity_variant_prevalence': severity_variant_prevalence,
            'age_rates_root': age_rates_root,
            'testing_root': testing_root,
            'day_inflection': day_inflection,
            'covariates': covariates,
            'covariate_list': covariate_list,
            'idr_covariate_list': idr_covariate_list,
            'cross_variant_immunity': cross_variant_immunity,
            'variant_risk_ratio': variant_risk_ratio,
            'durations': durations,
            'day_0': day_0,
            'pred_start_date': pred_start_date,
            'pred_end_date': pred_end_date,
            'verbose': verbose,
        }
        for n, (covariate_list,
                idr_covariate_list,
                seroprevalence,
                sensitivity_data,
                cross_variant_immunity,
                variant_risk_ratio,
                day_inflection,
                durations,) in enumerate(zip(selected_combinations,
                                             idr_covariate_pool,
                                             seroprevalence_samples,
                                             sensitivity_data_samples,
                                             cross_variant_immunity_samples,
                                             variant_risk_ratio_samples,
                                             day_inflection_pool,
                                             durations_samples,))
    }

    if verbose:
        logger.info('Storing inputs and submitting sero-sample jobs.')
    inputs_path = out_dir / 'pipeline_inputs.pkl'
    with inputs_path.open('wb') as file:
        pickle.dump(inputs, file, -1)
    pipeline_dir = out_dir / 'pipeline_outputs'
    shell_tools.mkdir(pipeline_dir)
    job_args_map = {n: [__file__, n, inputs_path, pipeline_dir]
                    for n in range(n_samples)}
    if gbd:
        cluster.run_cluster_jobs('covid_rates_pipeline', out_dir, job_args_map, 'gbd')
    else:
        cluster.run_cluster_jobs('covid_rates_pipeline', out_dir, job_args_map, 'standard')

    pipeline_results = {}
    for n in range(n_samples):
        with (pipeline_dir / f'{n}_outputs.pkl').open('rb') as file:
            outputs = pickle.load(file)
        pipeline_results.update(outputs)

    em_data = estimates.excess_mortailty_scalars(excess_mortality)

    return (pipeline_results, selected_combinations,
            cross_variant_immunity_samples, variant_risk_ratio_samples,
            reported_seroprevalence, reported_sensitivity_data,
            escape_variant_prevalence, severity_variant_prevalence,
            vaccine_coverage, em_data, hierarchy, population)
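# Worker-side sketch (an assumption, not shown in the source): pipeline_wrapper
# submits this module as a cluster job with args [__file__, n, inputs_path,
# pipeline_dir] and later reads `{n}_outputs.pkl` from pipeline_dir, so the
# entry point presumably unpickles its draw's inputs, runs the sample, and
# writes a dict keyed by draw number under that naming convention.
# `run_sample` is a hypothetical stand-in for the real per-draw routine.
import pickle
import sys
from pathlib import Path

if __name__ == '__main__':
    draw_id = int(sys.argv[1])
    inputs_path = Path(sys.argv[2])
    pipeline_dir = Path(sys.argv[3])
    with inputs_path.open('rb') as file:
        draw_inputs = pickle.load(file)[draw_id]
    outputs = run_sample(**draw_inputs)  # hypothetical per-draw routine
    # Keyed by draw so the parent can merge results via dict.update().
    with (pipeline_dir / f'{draw_id}_outputs.pkl').open('wb') as file:
        pickle.dump({draw_id: outputs}, file, -1)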
def make_run_directory(output_root: Union[str, Path]) -> Path:
    """Convenience function for making a new run directory and getting its path."""
    run_directory = get_run_directory(output_root)
    mkdir(run_directory)
    return run_directory
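# Usage sketch (an assumption, not part of the source): chaining the
# versioning helpers when starting a new run.  `get_run_directory` is called
# by make_run_directory but not shown here; it presumably returns a fresh
# (e.g. timestamped) sub-directory path under the output root.
output_root = Path('/path/to/model-outputs')  # hypothetical root
setup_directory_structure(output_root, with_production=False)
run_directory = make_run_directory(output_root)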
def write_seir_inputs(
        model_out_dir: Path,
        infections_draws_dir: Path,
        output_root: Path,
        hierarchy: pd.DataFrame,
        infections_draws: pd.DataFrame,
        em_scalar_data: pd.Series,
        deaths: pd.Series,
        estimated_ratios: Dict,
        variant_risk_ratio: List,
        sero_data: pd.DataFrame,
        sub_infections_draws: pd.DataFrame,
        n_draws: int,
        durations: List,
        reported_deaths: pd.Series,
):
    most_detailed = hierarchy['most_detailed'] == 1
    location_ids = hierarchy.loc[most_detailed, 'location_id'].to_list()

    logger.info('Writing SEIR inputs - infections draw files.')
    infections_draws_cols = infections_draws.columns
    infections_draws = pd.concat([infections_draws, deaths], axis=1)
    infections_draws = infections_draws.sort_index()
    deaths = infections_draws['deaths'].copy()
    infections_draws = [infections_draws[infections_draws_col].copy()
                        for infections_draws_col in infections_draws_cols]
    infections_draws = [
        pd.concat([infections_draw,
                   (deaths * em_scalar_data.loc[n]).rename('deaths')], axis=1)
        for n, infections_draw in enumerate(infections_draws)
    ]
    if not sub_infections_draws.empty:
        _, _, ifr = estimated_ratios['deaths']
        _get_sub_loc_deaths = functools.partial(
            aggregation.get_sub_loc_deaths,
            n_draws=n_draws,
            sub_infections_draws=sub_infections_draws.copy(),
            ifr=ifr.copy(),
            durations=durations.copy(),
            reported_deaths=reported_deaths.copy(),
        )
        with multiprocessing.Pool(MP_THREADS) as p:
            sub_deaths_scalar = list(tqdm(p.imap(_get_sub_loc_deaths, SUB_LOCATIONS),
                                          total=len(SUB_LOCATIONS), file=sys.stdout))
        sub_deaths = pd.concat([sds[0] for sds in sub_deaths_scalar]).sort_index()
        sub_em_scalar_data = pd.concat([sds[1] for sds in sub_deaths_scalar]).sort_index()
        del sub_deaths_scalar
        sub_infections_draws = [
            pd.concat([sub_infections_draws[infections_draws_col].copy(),
                       sub_deaths[infections_draws_col].rename('deaths')], axis=1)
            for infections_draws_col in infections_draws_cols
        ]
        infections_draws = [pd.concat([i_d, s_i_d]).sort_index()
                            for i_d, s_i_d in zip(infections_draws, sub_infections_draws)]
        em_scalar_data = pd.concat([em_scalar_data, sub_em_scalar_data]).sort_index()
    _inf_writer = functools.partial(
        data.write_infections_draws,
        infections_draws_dir=infections_draws_dir,
    )
    with multiprocessing.Pool(MP_THREADS) as p:
        infections_draws_paths = list(tqdm(p.imap(_inf_writer, infections_draws),
                                           total=n_draws, file=sys.stdout))

    for measure, (estimated_ratio, measure_durations, ratio_prior_estimates) in estimated_ratios.items():
        logger.info(f'Compiling {estimated_ratio.upper()} draws.')
        ratio_draws = []
        for draws_path in [result_path for result_path in model_out_dir.iterdir()
                           if str(result_path).endswith(f'_{estimated_ratio}_draws.parquet')]:
            ratio_draws.append(pd.read_parquet(draws_path))
        ratio_draws = pd.concat(ratio_draws)
        if estimated_ratio == 'ifr':
            ratio_rr_draws = []
            for draws_path in [result_path for result_path in model_out_dir.iterdir()
                               if str(result_path).endswith(f'_{estimated_ratio}_rr_draws.parquet')]:
                ratio_rr_draws.append(pd.read_parquet(draws_path))
            ratio_rr_draws = pd.concat(ratio_rr_draws)

        logger.info(f'Filling {estimated_ratio.upper()} with original model estimate '
                    f'where we do not have a posterior.')
        ratio_draws_cols = ratio_draws.columns
        ratio_prior_estimates = (ratio_prior_estimates
                                 .reset_index()
                                 .loc[:, ['location_id', 'draw', 'date', 'ratio']])
        ratio_prior_estimates = pd.pivot_table(ratio_prior_estimates,
                                               index=['location_id', 'date'],
                                               columns='draw',
                                               values='ratio')
        ratio_prior_estimates.columns = ratio_draws_cols
        ratio_locations = ratio_draws.reset_index()['location_id'].unique()
        ratio_prior_locations = ratio_prior_estimates.reset_index()['location_id'].unique()
        missing_locations = [l for l in ratio_prior_locations if l not in ratio_locations]
        missing_locations = [l for l in missing_locations if l in location_ids]
        ratio_prior_estimates = ratio_prior_estimates.loc[missing_locations]
        if len(ratio_prior_estimates) > 0:
            logger.info(f"Appending prior estimates for the following locations: "
                        f"{', '.join(ratio_prior_estimates.reset_index()['location_id'].astype(str).unique())}")
            ratio_draws = ratio_draws.append(ratio_prior_estimates)

        logger.info(f'Writing SEIR inputs - {estimated_ratio.upper()} draw files.')
        ratio_draws = ratio_draws.sort_index()
        if estimated_ratio == 'ifr':
            ifr_lr_rr = (ratio_rr_draws
                         .reset_index()
                         .loc[:, ['location_id', 'draw', 'date', 'ifr_lr_rr']])
            ifr_lr_rr = pd.pivot_table(ifr_lr_rr,
                                       index=['location_id', 'date'],
                                       columns='draw',
                                       values='ifr_lr_rr')
            ifr_lr_rr.columns = ratio_draws_cols
            ifr_hr_rr = (ratio_rr_draws
                         .reset_index()
                         .loc[:, ['location_id', 'draw', 'date', 'ifr_hr_rr']])
            ifr_hr_rr = pd.pivot_table(ifr_hr_rr,
                                       index=['location_id', 'date'],
                                       columns='draw',
                                       values='ifr_hr_rr')
            ifr_hr_rr.columns = ratio_draws_cols
            ratio_draws = [(ratio_draws[ratio_draws_col].copy(),
                            ifr_lr_rr[ratio_draws_col].copy(),
                            ifr_hr_rr[ratio_draws_col].copy(),)
                           for ratio_draws_col in ratio_draws_cols]
        else:
            ratio_draws = [[ratio_draws[ratio_draws_col].copy()]
                           for ratio_draws_col in ratio_draws_cols]
        ratio_draws_dir = output_root / f'{estimated_ratio}_draws'
        shell_tools.mkdir(ratio_draws_dir)
        _ratio_writer = functools.partial(
            data.write_ratio_draws,
            estimated_ratio=estimated_ratio,
            durations=measure_durations,
            variant_risk_ratio=variant_risk_ratio,
            ratio_draws_dir=ratio_draws_dir,
        )
        with multiprocessing.Pool(MP_THREADS) as p:
            ratio_draws_paths = list(tqdm(p.imap(_ratio_writer, ratio_draws),
                                          total=n_draws, file=sys.stdout))

    logger.info('Writing serology data and EM scaling factor data.')
    em_path = output_root / 'em_data.csv'
    em_scalar_data = (infections_draws[0]
                      .reset_index()
                      .loc[:, ['location_id', 'date']]
                      .merge(em_scalar_data.reset_index(), how='left'))
    em_scalar_data['em_scalar'] = em_scalar_data['em_scalar'].fillna(1)
    em_scalar_data.to_csv(em_path, index=False)
    # em_scalar_data['date'] = em_scalar_data['date'].astype(str)
    # em_path = output_root / 'em_data.parquet'
    # em_scalar_data.to_parquet(em_path, engine='fastparquet', compression='gzip')
    sero_data['included'] = 1 - sero_data['is_outlier']
    sero_data = sero_data.rename(columns={'sero_sample_mean': 'value'})
    sero_data = sero_data.loc[:, ['included', 'value']]
    sero_path = output_root / 'sero_data.csv'
    sero_data.reset_index().to_csv(sero_path, index=False)
def test_mkdir_no_args(mode: int, tmp_path: Path):
    tmp_path.rmdir()
    assert not tmp_path.exists()
    mkdir(tmp_path, mode=mode)
    assert tmp_path.exists()
    assert oct(tmp_path.stat().st_mode)[-3:] == oct(mode)[-3:]
def make_deaths(app_metadata: cli_tools.Metadata,
                input_root: Path, output_root: Path,
                holdout_days: int, dow_holdouts: int, n_draws: int):
    logger.debug(f"Setting up output directories in {str(output_root)}.")
    model_dir = output_root / 'models'
    spline_settings_dir = output_root / 'spline_settings'
    plot_dir = output_root / 'plots'
    shell_tools.mkdir(model_dir)
    shell_tools.mkdir(spline_settings_dir)
    shell_tools.mkdir(plot_dir)

    logger.debug("Loading and cleaning data.")
    hierarchy = data.load_most_detailed_locations(input_root)
    agg_hierarchy = data.load_aggregate_locations(input_root)
    full_data = data.load_full_data(input_root)
    full_data, manipulation_metadata = data.evil_doings(full_data)
    app_metadata.update({'data_manipulation': manipulation_metadata})
    death_data = data.get_death_data(full_data)
    max_death_date = (death_data
                      .groupby('location_id')['Date'].max()
                      .rename('max_death_date')
                      .reset_index())
    case_data = data.get_shifted_data(full_data, 'Confirmed', 'Confirmed case rate')
    case_data = case_data.merge(max_death_date)
    case_data = case_data.loc[case_data['True date'] <= case_data['max_death_date']]
    del case_data['max_death_date']
    hosp_data = data.get_shifted_data(full_data, 'Hospitalizations', 'Hospitalization rate')
    hosp_data = hosp_data.merge(max_death_date)
    hosp_data = hosp_data.loc[hosp_data['True date'] <= hosp_data['max_death_date']]
    del hosp_data['max_death_date']
    del max_death_date
    pop_data = data.get_population_data(input_root, hierarchy)

    logger.debug(f"Dropping {holdout_days} days from the end of the data.")
    case_data = data.holdout_days(case_data, holdout_days)
    hosp_data = data.holdout_days(hosp_data, holdout_days)
    death_data = data.holdout_days(death_data, holdout_days)

    logger.debug("Filtering data by location.")
    case_data, missing_cases = data.filter_data_by_location(case_data, hierarchy, 'cases')
    hosp_data, missing_hosp = data.filter_data_by_location(hosp_data, hierarchy, 'hospitalizations')
    death_data, missing_deaths = data.filter_data_by_location(death_data, hierarchy, 'deaths')
    pop_data, missing_pop = data.filter_data_by_location(pop_data, hierarchy, 'population')

    logger.debug("Combine datasets.")
    model_data = data.combine_data(case_data, hosp_data, death_data, pop_data, hierarchy)
    model_data = model_data.sort_values(['location_id', 'Date']).reset_index(drop=True)
    model_data = data.drop_leading_zeros(
        model_data, ['Death rate'])  # , 'Confirmed case rate', 'Hospitalization rate'

    logger.debug("Create aggregates for modeling.")
    agg_locations = [aggregate.Location(lid, lname) for lid, lname in
                     zip(agg_hierarchy['location_id'], agg_hierarchy['location_name'])]
    agg_model_data = aggregate.compute_location_aggregates_data(
        model_data, hierarchy, agg_locations,
        ['Confirmed case rate', 'Hospitalization rate', 'Death rate'])
    model_data = model_data.append(agg_model_data)
    model_data = model_data.sort_values(['location_id', 'Date']).reset_index(drop=True)

    logger.debug("Filter cases/hospitalizations based on threshold.")
    model_data, dropped_locations, no_cases_locs, no_hosp_locs = data.filter_to_epi_threshold(
        hierarchy, model_data, death_threshold=5, epi_threshold=10)
    app_metadata.update({'dropped_locations': dropped_locations})

    logger.debug("Preparing model settings.")
    model_settings = {}
    s1_settings = {'dep_var': 'Death rate',
                   'model_dir': str(model_dir),
                   'indep_vars': []}
    cfr_settings = {'spline_var': 'Confirmed case rate', 'model_type': 'CFR'}
    cfr_settings.update(s1_settings)
    model_settings.update({'CFR': cfr_settings})
    hfr_settings = {'spline_var': 'Hospitalization rate', 'model_type': 'HFR'}
    hfr_settings.update(s1_settings)
    model_settings.update({'HFR': hfr_settings})
    smoother_settings = {
        'obs_var': 'Death rate',
        'pred_vars': ['Predicted death rate (CFR)', 'Predicted death rate (HFR)'],
        'spline_vars': ['Confirmed case rate', 'Hospitalization rate'],
        'spline_settings_dir': str(spline_settings_dir),
    }
    model_settings.update({'smoother': smoother_settings})
    model_settings['no_cases_locs'] = no_cases_locs
    model_settings['no_hosp_locs'] = no_hosp_locs

    logger.debug("Launching models by location.")
    working_dir = output_root / 'model_working_dir'
    shell_tools.mkdir(working_dir)
    data_path = Path(working_dir) / 'model_data.pkl'
    with data_path.open('wb') as data_file:
        pickle.dump(model_data, data_file, -1)
    results_path = Path(working_dir) / 'model_outputs'
    shell_tools.mkdir(results_path)
    model_settings['results_dir'] = str(results_path)
    settings_path = Path(working_dir) / 'settings.yaml'
    with settings_path.open('w') as settings_file:
        yaml.dump(model_settings, settings_file)
    job_args_map = {
        location_id: [models.__file__, location_id, data_path, settings_path,
                      dow_holdouts, str(plot_dir), n_draws, cluster.OMP_NUM_THREADS]
        for location_id in model_data['location_id'].unique()
        if location_id not in PARENT_MODEL_LOCATIONS
    }
    cluster.run_cluster_jobs('covid_death_models', output_root, job_args_map)

    logger.debug("Compiling results.")
    results = []
    for result_path in results_path.iterdir():
        with result_path.open('rb') as result_file:
            results.append(pickle.load(result_file))
    post_model_data = pd.concat([r.model_data for r in results]).reset_index(drop=True)
    noisy_draws = pd.concat([r.noisy_draws for r in results]).reset_index(drop=True)
    smooth_draws = pd.concat([r.smooth_draws for r in results]).reset_index(drop=True)
    failed_model_locations = (model_data
                              .loc[~model_data['location_id'].isin(post_model_data['location_id'].to_list()),
                                   'location_id']
                              .unique().tolist())
    failed_model_locations = [l for l in failed_model_locations
                              if l not in PARENT_MODEL_LOCATIONS]
    failed_model_locations = [l for l in failed_model_locations
                              if l in hierarchy['location_id'].to_list()]
    app_metadata.update({'failed_model_locations': failed_model_locations})
    model_data = post_model_data.append(
        model_data.loc[model_data['location_id'].isin(PARENT_MODEL_LOCATIONS)])
    obs_var = smoother_settings['obs_var']
    spline_vars = smoother_settings['spline_vars']

    logger.debug("Capturing location-dates with NaNs and dropping them.")
    nan_rows = smooth_draws.isnull().any(axis=1)
    smooth_draws_nans = smooth_draws.loc[nan_rows].reset_index(drop=True)
    smooth_draws = smooth_draws.loc[~nan_rows].reset_index(drop=True)
    nan_min = smooth_draws_nans.groupby('location_id')['date'].min()
    val_max = smooth_draws.groupby('location_id')['date'].max()
    date_diffs = (nan_min - val_max).apply(lambda x: x.days)
    date_diffs = date_diffs.loc[date_diffs.notnull()]
    app_metadata.update({'nan_locations': date_diffs.index.to_list()})
    if (date_diffs < 0).any():
        date_diffs.to_csv(output_root / 'problem_location_report.csv', index=False)
        raise ValueError('Dropping NaNs in middle of time series (see problem_location_report.csv)')

    logger.debug("Fill specified model locations with parent and plot them.")
    smooth_draws, model_data = data.apply_parents(
        PARENT_MODEL_LOCATIONS, hierarchy, smooth_draws, model_data, pop_data)
    summarize.summarize_and_plot(
        smooth_draws.loc[smooth_draws['location_id'].isin(PARENT_MODEL_LOCATIONS)]
        .rename(columns={'date': 'Date'}),
        model_data.loc[model_data['location_id'].isin(PARENT_MODEL_LOCATIONS)],
        str(plot_dir),
        obs_var=obs_var, spline_vars=spline_vars, pop_data=pop_data)
    app_metadata.update({'parent_model_locations': PARENT_MODEL_LOCATIONS})

    logger.debug("Make post-model aggregates and plot them.")
    agg_locations = [aggregate.Location(1, 'Global')] + agg_locations
    agg_model_data = aggregate.compute_location_aggregates_data(
        model_data, hierarchy, agg_locations)
    agg_model_data['location_id'] = -agg_model_data['location_id']
    agg_model_data['location_name'] = agg_model_data['location_name'] + ' (model aggregate)'
    agg_draw_df = aggregate.compute_location_aggregates_draws(
        smooth_draws.rename(columns={'date': 'Date'}), hierarchy, agg_locations)
    agg_draw_df['location_id'] = -agg_draw_df['location_id']
    summarize.summarize_and_plot(agg_draw_df, agg_model_data, str(plot_dir),
                                 obs_var=obs_var, spline_vars=spline_vars)

    logger.debug("Compiling plots.")
    plot_hierarchy = aggregate.get_sorted_hierarchy_w_aggs(hierarchy, agg_hierarchy)
    possible_pdfs = ['-1.pdf'] + [f'{l}.pdf' for l in plot_hierarchy.location_id]
    existing_pdfs = [str(x).split('/')[-1] for x in plot_dir.iterdir() if x.is_file()]
    pdfs = [f'{plot_dir}/{pdf}' for pdf in possible_pdfs if pdf in existing_pdfs]
    pdf_merger.pdf_merger(pdfs=pdfs, outfile=str(output_root / 'model_results.pdf'))

    logger.debug(f"Writing output data in {str(output_root)}.")
    model_data = model_data.rename(columns={'Date': 'date'}).set_index(['location_id', 'date'])
    noisy_draws = noisy_draws.set_index(['location_id', 'date'])
    noisy_draws['observed'] = model_data['Death rate'].notnull().astype(int)
    smooth_draws = smooth_draws.set_index(['location_id', 'date'])
    smooth_draws['observed'] = model_data['Death rate'].notnull().astype(int)
    model_data.rename(columns={'date': 'Date'}).reset_index().to_csv(
        output_root / 'model_data.csv', index=False)
    noisy_draws.reset_index().to_csv(output_root / 'model_results.csv', index=False)
    smooth_draws.reset_index().to_csv(output_root / 'model_results_refit.csv', index=False)
    smooth_draws_nans.to_csv(output_root / 'model_results_refit_nans.csv', index=False)