def construct_args_burdenator_cleanup(parser, cli_args=None):
    """Parse arguments for rearranging the draw files at the end of the
    burdenator run.

    Creates the per-(year, measure) cleanup log directory, registers an
    in-memory logger, and attaches derived cache/cod/epi paths to the
    parsed namespace.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    args = parser.parse_args(sys.argv[1:] if cli_args is None else cli_args)
    args.tool_name = 'burdenator'

    # Cache and log directories live under the run's output directory
    args.cache_dir = '{}/cache'.format(args.out_dir)
    args.log_dir = os.path.join(args.out_dir, 'log_cleanup',
                                str(args.year_id), str(args.measure_id))
    makedirs_safely(args.log_dir)

    logfile = "{}_{}_{}.log".format(args.measure_id, args.location_id,
                                    args.year_id)
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    args.logger = create_logger_in_memory("dalynator", level,
                                          args.log_dir + "/" + logfile)

    # Resolve cod/epi env directories from the input data root
    args.cod_dir = (args.input_data_root + "/codcorrect/" +
                    str(args.cod_version) + "/draws")
    args.epi_dir = get_folder_structure(
        os.path.join(args.input_data_root, 'como', str(args.epi_version)))
    return args
def get_args_pct_change(parser, cli_args=None):
    """Creates arguments from parser for pct change calculation.

    Resolves the GBD round, creates the per-location log directory and an
    in-memory logger, resolves 'best' codcorrect/fauxcorrect and default
    como versions, and attaches the epi/cod draw locations to the args.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger

    NOTE(review): the "FILEPATH" literals appear to be redaction
    placeholders for real path templates — confirm against the original
    source before relying on the formatted values.
    """
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parser.parse_args(cli_args)
    # Normalize gbd_round/gbd_round_id so both are populated
    args.gbd_round, args.gbd_round_id = ac.populate_gbd_round_args(
        args.gbd_round, args.gbd_round_id)
    args.log_dir = os.path.join(args.out_dir, 'log_pct_change',
                                str(args.location_id))
    makedirs_safely(args.log_dir)
    logfn = "FILEPATH".format(args.start_year, args.end_year)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    args.logger = create_logger_in_memory(
        "dalynator", log_level, "FILEPATH".format(args.log_dir, logfn),
        ['aggregator.aggregators', 'jobmon'])
    # Get cod/epi env directories: resolve 'best'/None version markers to
    # concrete version numbers before building any paths
    if args.codcorrect_version == 'best':
        args.codcorrect_version = ac.best_version(
            'codcorrect', args.gbd_round_id, args.decomp_step)
    if args.fauxcorrect_version == 'best':
        args.fauxcorrect_version = ac.best_version(
            'fauxcorrect', args.gbd_round_id, args.decomp_step)
    if args.epi_version is None:
        args.epi_version = ac.best_version('como', args.gbd_round_id,
                                           args.decomp_step)
    args.epi_dir = get_como_folder_structure(os.path.join(
        args.input_data_root, 'como', str(args.epi_version)))
    # cod_object abstracts over codcorrect vs fauxcorrect draw layouts
    cod_object = to.cod_or_faux_correct(
        args.input_data_root,
        codcorrect_version=args.codcorrect_version,
        fauxcorrect_version=args.fauxcorrect_version)
    args.cod_dir = cod_object.abs_path_to_draws
    args.cod_pattern = cod_object.file_pattern
    return args
def get_args_burdenator_loc_agg(parser, cli_args=None):
    """Parse arguments for burdenator location aggregation.

    Resolves the GBD round, creates the per-(year, measure) loc-agg log
    directory, and registers an in-memory logger on the parsed namespace.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    args = parser.parse_args(sys.argv[1:] if cli_args is None else cli_args)
    args.gbd_round, args.gbd_round_id = ac.populate_gbd_round_args(
        args.gbd_round, args.gbd_round_id)

    # Cache and log directories live under the run's data root
    args.cache_dir = 'FILEPATH'.format(args.data_root)
    args.log_dir = os.path.join(args.data_root, 'log_loc_agg',
                                str(args.year_id), str(args.measure_id))
    makedirs_safely(args.log_dir)

    logfile = "FILEPATH".format(args.measure_id, args.rei_id,
                                args.year_id, args.sex_id)
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    args.logger = create_logger_in_memory(
        "dalynator", level, args.log_dir + "/" + logfile,
        ['aggregator.aggregators', 'jobmon'])
    return args
def write_csv(df, filename, write_columns_order=None,
              write_out_star_ids=False, dual_upload=False):
    """Write df to filename as a CSV destined for database upload.

    Rejected rows are first split out to a sidecar CSV. Rows are sorted
    for the database when possible, falling back to write_columns_order
    (or the incoming order). The star_id column is excluded unless
    write_out_star_ids is True. With dual_upload, a second copy is also
    written under the public-upload path.
    """
    df = separate_rejected_data_to_csv(df, filename)

    # Database ordering takes priority; fall back to the caller's column
    # order, then leave unsorted.
    try:
        out_df = sort_for_db(df)
    except ValueError:
        out_df = (df.sort_values(write_columns_order)
                  if write_columns_order else df)

    # Column selection, dropping star_id when stars aren't wanted
    source_cols = (write_columns_order if write_columns_order
                   else df.columns.tolist())
    cols = remove_unwanted_star_id_column(source_cols, write_out_star_ids)

    out_df.to_csv(filename, columns=cols, index=False)
    if dual_upload:
        public_path = sub_pub_for_cc(filename)
        makedirs_safely(os.path.dirname(public_path))
        out_df.to_csv(public_path, columns=cols, index=False)
def __init__(self, location_set_id, year_id, rei_id, sex_id, measure_id,
             gbd_round_id, n_draws, data_root, region_locs,
             write_out_star_ids):
    """Set up a location-aggregation job for one
    (year, rei, sex, measure) slice.

    Caches the demographic identifiers, loads the location hierarchy via
    a DataContainer, derives the input/output draw directories, deletes
    any stale aggregate files from a prior failed run, and builds the
    draw source/sink plus aggregation operator.

    NOTE(review): region_locs is stored but its semantics (presumably
    region-level location ids used by get_operator/get_draw_source_sink)
    cannot be confirmed from this block.
    """
    self.location_set_id = location_set_id
    self.year_id = year_id
    self.rei_id = rei_id
    self.sex_id = sex_id
    self.measure_id = measure_id
    self.gbd_round_id = gbd_round_id
    self.n_draws = n_draws
    self.data_root = data_root
    self.region_locs = region_locs
    # DataContainer keyed by the demographic filters; reads from the
    # run's cache directory under data_root
    self.data_container = DataContainer(
        {'location_set_id': self.location_set_id,
         'year_id': self.year_id,
         'sex_id': self.sex_id},
        n_draws=self.n_draws,
        gbd_round_id=self.gbd_round_id,
        cache_dir=os.path.join(self.data_root, 'cache'))
    self.loctree = self.data_container[
        'location_hierarchy_{}'.format(self.location_set_id)]
    self.in_dir = os.path.join(self.data_root, 'draws')
    self.out_dir = os.path.join(self.data_root, 'loc_agg_draws/burden')
    mkds.makedirs_safely(self.out_dir)
    self.write_out_star_ids = write_out_star_ids
    # Remove old aggregates in case jobs failed in the middle:
    # every non-leaf node of the location tree may have a stale file
    aggregates = [n.id for n in self.loctree.nodes
                  if n not in self.loctree.leaves()]
    for loc in aggregates:
        filename = ('{o}/{lo}/{me}/{m}_{y}_{loc}_{r}_{s}.h5'
                    .format(o=self.out_dir, lo=loc, me=self.measure_id,
                            m=self.measure_id, y=self.year_id, loc=loc,
                            r=self.rei_id, s=self.sex_id))
        logger.debug("Deleting potentially pre-existing loc-agg file"
                     "{e}: '{f}'".format(e=os.path.exists(filename),
                                         f=filename))
        # Missing file is fine — nothing to clean up
        with contextlib.suppress(FileNotFoundError):
            os.remove(filename)
    self.index_cols = ['measure_id', 'metric_id', 'sex_id', 'cause_id',
                       'rei_id', 'year_id', 'age_group_id']
    self.value_cols = ['draw_{}'.format(i) for i in range(self.n_draws)]
    # Only NUMBER-space draws for this exact demographic slice are read
    self.draw_filters = {'metric_id': gbd.metrics.NUMBER,
                         'rei_id': self.rei_id,
                         'sex_id': self.sex_id,
                         'measure_id': self.measure_id,
                         'year_id': self.year_id}
    self.operator = self.get_operator()
    self.draw_source, self.draw_sink = self.get_draw_source_sink()
def construct_directories(out_dir, log_dir, cache_dir, resume):
    """Create the output directory and the run_all logger directories.

    Used by both burdenator and dalynator. Refuses to run over a
    non-empty output or log directory unless resume is True; in resume
    mode, existing logs are rotated first.

    :param out_dir: The root directory WITH the version number
    :param log_dir: The path to the log directory
    :param cache_dir: The path to the cache directory
    :param resume: True if this is running in resume mode
    :raises ValueError: if out_dir or log_dir is non-empty and not resuming
    """
    # Both directories must be empty unless we are resuming
    for label, directory in (("Output", out_dir), ("Log", log_dir)):
        if (os.path.isdir(directory) and os.listdir(directory)
                and not resume):
            raise ValueError("{} directory {} contains files and NOT "
                             "running in resume mode".format(label,
                                                             directory))

    for directory in (out_dir, log_dir, cache_dir):
        makedirs_safely(directory)

    if resume:
        # If resuming then rotate (rename) the main log, daly_run_all.log
        rotate_logs(out_dir, log_dir)

    makedirs_safely(os.path.join(out_dir, "stderr"))
def _prepare_with_external_side_effects(self):
    """Create the output draws and log directories and register the
    in-memory "dalynator" logger.

    Returns:
        Nothing
    """
    for directory in (self.output_draws_dir, self.log_dir):
        makedirs_safely(directory)
    level = logging.INFO
    if self.verbose:
        level = logging.DEBUG
    # Registering the logger is the side effect; the instance itself
    # is not needed here.
    create_logger_in_memory(
        "dalynator", level,
        self.log_dir + "/daly_{}_{}.log".format(self.location_id,
                                                self.year_id))
def get_args_pct_change(parser, cli_args=None):
    """Creates arguments from parser for pct change calculation.

    Creates the per-location log directory and registers an in-memory
    logger on the parsed namespace.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with log_dir and a logger
    """
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parser.parse_args(cli_args)
    args.log_dir = os.path.join(args.out_dir, 'log_pct_change',
                                str(args.location_id))
    makedirs_safely(args.log_dir)
    logfn = "pc_{}_{}.log".format(args.start_year, args.end_year)
    # NOTE(review): log level is hard-coded to DEBUG here, unlike the
    # sibling arg-constructors that honor a verbose flag — confirm this
    # parser has no --verbose option before changing.
    args.logger = create_logger_in_memory("dalynator", logging.DEBUG,
                                          "{}/{}".format(args.log_dir,
                                                         logfn))
    return args
def get_summ_filename(draw_dir, risk_type, location_id, year_id,
                      measure_id):
    """Return the summary-file (i.e. csv file) name for the given argset,
    creating the directory if necessary.

    :param draw_dir: root draw directory for this run
    :param risk_type: RISK_REI_TYPE or ETI_REI_TYPE
    :param location_id: location id embedded in the file name
    :param year_id: year id embedded in the file name
    :param measure_id: measure id embedded in the path
    :return: full path of the summary csv (directory is created)
    :raises ValueError: if risk_type is not a recognized REI type
    """
    if risk_type == RISK_REI_TYPE:
        file_label = 'risk'
    elif risk_type == ETI_REI_TYPE:
        file_label = 'eti'
    else:
        # BUG FIX: previously an unrecognized risk_type fell through and
        # raised a confusing NameError on file_label below; fail fast
        # with an explicit message instead.
        raise ValueError("Unknown risk_type: {}".format(risk_type))
    fn = ("{dd}/upload/{m}/single_year/"
          "upload_{fl}_{l}_{y}.csv".format(dd=draw_dir, fl=file_label,
                                           l=location_id, m=measure_id,
                                           y=year_id))
    makedirs_safely(os.path.dirname(fn))
    return fn
def df_to_csv(this_df, index_cols, this_out_dir, out_file_basename,
              write_columns_order):
    """Compute summaries for this_df and write them as a CSV.

    Builds a ComputeSummaries over the data frame, ensures the output
    directory exists, and delegates the actual write to write_csv.
    """
    summarized = ComputeSummaries(
        this_df,
        write_columns_order,
        index_cols
    ).get_data_frame()
    makedirs_safely(this_out_dir)
    out_path = os.path.join(this_out_dir, out_file_basename)
    logger.info("Summary file output path {}".format(out_path))
    write_csv(summarized, out_path, write_columns_order=write_columns_order)
def rotate_logs(out_dir, log_dir):
    """Move the existing daly_run_all.log and the stderr directory to
    timestamped versions.

    Useful during resume, so that we don't keep appending to the same
    log. The standard stderr directory is re-created afterwards.
    """
    now = time.localtime()
    stamp = "{}-{:02d}-{:02d}_{:02d}:{:02d}:{:02d}".format(
        now.tm_year, now.tm_mon, now.tm_mday,
        now.tm_hour, now.tm_min, now.tm_sec)

    main_log = os.path.join(log_dir, "daly_run_all.log")
    if os.path.exists(main_log):
        os.rename(main_log, "{}.{}".format(main_log, stamp))

    stderr_dir = os.path.join(out_dir, "stderr")
    if os.path.exists(stderr_dir):
        os.rename(stderr_dir, "{}.{}".format(stderr_dir, stamp))
    # And re-recreate the normal stderr directory to be sure
    makedirs_safely(stderr_dir)
def get_args_and_create_dirs(parser, cli_args=None):
    """Parses the command line using the parser and creates output
    directory and logger. Called by run_pipeline_*. Not used by run_all.

    Resolves default cod/epi versions, creates per-location draws and
    log directories under out_dir, registers an in-memory logger, and
    attaches cod/daly/epi/paf draw directories to the args.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parser.parse_args(cli_args)
    # resolve defaults for cod and epi versions
    args.gbd_round, args.gbd_round_id = ac.populate_gbd_round_args(
        args.gbd_round, args.gbd_round_id)
    if args.cod_version is None:
        args.cod_version = ac.best_version('cod', args.gbd_round_id)
    if args.epi_version is None:
        args.epi_version = ac.best_version('como', args.gbd_round_id)
    # Store all years for each location in one directory.
    # Note: args.out_dir is re-pointed at the per-location draws
    # directory; top_out_dir keeps the original run root.
    top_out_dir = args.out_dir
    args.cache_dir = '{}/cache'.format(args.out_dir)
    args.log_dir = os.path.join(top_out_dir, 'log', str(args.location_id))
    args.out_dir = os.path.join(top_out_dir, 'draws',
                                str(args.location_id))
    makedirs_safely(args.out_dir)
    makedirs_safely(args.log_dir)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    args.logger = create_logger_in_memory(
        "dalynator", log_level,
        args.log_dir + "/daly_{}_{}.log".format(args.location_id,
                                                args.year_id))
    args.cod_dir = "{}/codcorrect/{}/draws/".format(args.input_data_root,
                                                    args.cod_version)
    # daly_dir only exists for tools that declare a daly_version
    if hasattr(args, 'daly_version'):
        args.daly_dir = "{}/dalynator/{}/draws/".format(
            args.input_data_root, args.daly_version)
    else:
        args.daly_dir = None
    args.epi_dir = get_como_folder_structure(
        os.path.join(args.input_data_root, 'como', str(args.epi_version)))
    # paf_dir only exists for tools that declare a paf_version
    if hasattr(args, 'paf_version'):
        args.paf_dir = "{}/pafs/{}".format(args.input_data_root,
                                           args.paf_version)
    else:
        args.paf_dir = None
    return args
def construct_args_burdenator_cleanup(parser, cli_args=None):
    """Creates arguments from parser for rearranging the draw files at
    the end of the burdenator run.

    Resolves the GBD round, creates the per-(year, measure) cleanup log
    directory, registers an in-memory logger, resolves 'best'/None
    version markers, and attaches cod/epi draw locations to the args.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger

    NOTE(review): the "FILEPATH" literals appear to be redaction
    placeholders for real path templates — confirm against the original
    source before relying on the formatted values.
    """
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parser.parse_args(cli_args)
    args.tool_name = 'burdenator'
    args.gbd_round, args.gbd_round_id = ac.populate_gbd_round_args(
        args.gbd_round, args.gbd_round_id)
    # Create log directory
    top_out_dir = args.out_dir
    args.cache_dir = 'FILEPATH'.format(args.out_dir)
    args.log_dir = os.path.join(top_out_dir, 'log_cleanup',
                                str(args.year_id), str(args.measure_id))
    log_filename = "FILEPATH".format(
        args.measure_id, args.location_id, args.year_id)
    makedirs_safely(args.log_dir)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    args.logger = create_logger_in_memory(
        "dalynator", log_level, args.log_dir + "/" + log_filename,
        ['aggregator.aggregators', 'jobmon'])
    # Get cod/epi env directories: resolve 'best'/None markers to
    # concrete version numbers before building any paths
    if args.codcorrect_version == 'best':
        args.codcorrect_version = ac.best_version(
            'codcorrect', args.gbd_round_id, args.decomp_step)
    if args.fauxcorrect_version == 'best':
        args.fauxcorrect_version = ac.best_version(
            'fauxcorrect', args.gbd_round_id, args.decomp_step)
    if args.epi_version is None:
        args.epi_version = ac.best_version('como', args.gbd_round_id,
                                           args.decomp_step)
    args.epi_dir = get_como_folder_structure(os.path.join(
        args.input_data_root, 'como', str(args.epi_version)))
    # cod_object abstracts over codcorrect vs fauxcorrect draw layouts
    cod_object = to.cod_or_faux_correct(
        args.input_data_root,
        codcorrect_version=args.codcorrect_version,
        fauxcorrect_version=args.fauxcorrect_version)
    args.cod_dir = cod_object.abs_path_to_draws
    args.cod_pattern = cod_object.file_pattern
    return args
def create_run_all_directories(args):
    """Create the output directory and the run_all logger.

    Used by both burdenator and dalynator. Refuses to run over a
    non-empty output or log directory unless args.resume is True; in
    resume mode, existing logs are rotated first.

    :param args: namespace with out_dir/log_dir/cache_dir (populated by
        add_run_all_directories_to_args), resume, and verbose
    :raises ValueError: if out_dir or log_dir is non-empty and not resuming
    """
    add_run_all_directories_to_args(args)
    # Check that both directories are empty. If they are not-empty then
    # only continue if we are in resume mode
    if os.path.isdir(args.out_dir):
        if os.listdir(args.out_dir) and not args.resume:
            raise ValueError(
                "Output directory {} contains files and NOT running in resume mode"
                .format(args.out_dir))
    if os.path.isdir(args.log_dir):
        if os.listdir(args.log_dir) and not args.resume:
            raise ValueError(
                "Log directory {} contains files and NOT running in resume mode"
                .format(args.log_dir))
    makedirs_safely(args.log_dir)
    makedirs_safely(args.out_dir)
    makedirs_safely(args.cache_dir)
    if args.resume:
        # If resuming then rotate (rename) the main log, daly_run_all.log
        rotate_logs(args.out_dir, args.log_dir)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    # FIX: the return value was previously bound to an unused local named
    # "logger", shadowing the module-level logger; registering the logger
    # is the side effect we want, so discard the return value.
    create_logger_in_memory("dalynator", log_level,
                            args.log_dir + "/daly_run_all.log")
def get_args_burdenator_loc_agg(parser, cli_args=None):
    """Parse arguments for burdenator location aggregation.

    Creates the per-(year, measure) loc-agg log directory under out_dir
    and registers an in-memory logger on the parsed namespace.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    args = parser.parse_args(sys.argv[1:] if cli_args is None else cli_args)

    # Cache and log directories live under the run's output directory
    args.cache_dir = '{}/cache'.format(args.out_dir)
    args.log_dir = os.path.join(args.out_dir, 'log_loc_agg',
                                str(args.year_id), str(args.measure_id))
    makedirs_safely(args.log_dir)

    logfile = "{}_{}_{}_{}.log".format(args.measure_id, args.rei_id,
                                       args.year_id, args.sex_id)
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    args.logger = create_logger_in_memory("dalynator", level,
                                          args.log_dir + "/" + logfile)
    return args
def construct_args_dalynator_upload(parser, cli_args=None):
    """Parse arguments for uploading dalynator data.

    Creates the per-(table_type, measure) upload log directory and
    registers an in-memory logger on the parsed namespace.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    args = parser.parse_args(sys.argv[1:] if cli_args is None else cli_args)
    args.tool_name = 'dalynator'

    # Cache and log directories live under the run's output directory
    args.cache_dir = '{}/cache'.format(args.out_dir)
    args.log_dir = os.path.join(args.out_dir, 'log_upload',
                                args.table_type, str(args.measure_id))
    makedirs_safely(args.log_dir)

    logfile = "upload_{}_{}.log".format(args.table_type, args.measure_id)
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    args.logger = create_logger_in_memory("dalynator", level,
                                          args.log_dir + "/" + logfile)
    return args
def construct_args_upload(parser, cli_args=None):
    """Parse arguments for uploading data.

    Creates the per-table_type upload log directory and registers an
    in-memory logger on the parsed namespace.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    args = parser.parse_args(sys.argv[1:] if cli_args is None else cli_args)

    # Cache and log directories live under the run's output directory
    args.cache_dir = 'FILEPATH'.format(args.out_dir)
    args.log_dir = os.path.join(args.out_dir, 'log_upload', args.table_type)
    makedirs_safely(args.log_dir)

    logfile = "FILEPATH".format(args.gbd_process_version_id,
                                args.table_type)
    level = logging.INFO
    if args.verbose:
        level = logging.DEBUG
    args.logger = create_logger_in_memory(
        "dalynator", level, args.log_dir + "/" + logfile,
        ['aggregator.aggregators', 'jobmon'])
    return args
def get_args_and_create_dirs(parser, cli_args=None):
    """Parses the command line using the parser and creates output
    directory and logger. Called by run_pipeline_*. Not used by run_all.

    Resolves 'best'/None version markers, creates per-location draws and
    log directories under out_dir, registers an in-memory logger, and
    attaches cod/daly/epi/paf draw locations to the args.

    :param parser: argparse parser to use
    :param cli_args: argument list; defaults to sys.argv[1:]
    :return: parsed args namespace augmented with paths and a logger
    """
    if cli_args is None:
        cli_args = sys.argv[1:]
    args = parser.parse_args(cli_args)

    # resolve defaults for cod and epi versions
    if args.codcorrect_version == 'best':
        args.codcorrect_version = ac.best_version(
            'codcorrect', args.gbd_round_id, args.decomp_step)
    if args.fauxcorrect_version == 'best':
        args.fauxcorrect_version = ac.best_version(
            'fauxcorrect', args.gbd_round_id, args.decomp_step)
    if args.epi_version is None:
        args.epi_version = ac.best_version('como', args.gbd_round_id,
                                           args.decomp_step)

    # BUG FIX: cod_object was previously constructed BEFORE the 'best'
    # markers were resolved, so it could be built with the literal string
    # 'best' instead of a concrete version number. Construct it after
    # resolution, consistent with get_args_pct_change and
    # construct_args_burdenator_cleanup.
    cod_object = to.cod_or_faux_correct(args.input_data_root,
                                        args.codcorrect_version,
                                        args.fauxcorrect_version)

    # Store all years for each location in one directory.
    # args.out_dir is re-pointed at the per-location draws directory;
    # top_out_dir keeps the original run root.
    top_out_dir = args.out_dir
    args.cache_dir = 'FILEPATH'.format(args.out_dir)
    makedirs_safely(os.path.join(top_out_dir, 'log_most_detailed'))
    args.log_dir = os.path.join(top_out_dir, 'log_most_detailed',
                                str(args.location_id))
    args.out_dir = os.path.join(top_out_dir, 'draws',
                                str(args.location_id))
    makedirs_safely(args.out_dir)
    makedirs_safely(args.log_dir)
    log_level = logging.DEBUG if args.verbose else logging.INFO
    args.logger = create_logger_in_memory(
        "dalynator", log_level,
        args.log_dir + "FILEPATH".format(args.location_id, args.year_id),
        ['aggregator.aggregators', 'jobmon'])
    args.cod_dir = cod_object.abs_path_to_draws
    args.cod_pattern = cod_object.file_pattern
    # this had daly_version before but I think we still want this
    # differentiated file path
    if hasattr(args, 'tool_name') and args.tool_name == "dalynator":
        args.daly_dir = "FILEPATH".format(args.input_data_root,
                                          args.output_version)
    else:
        args.daly_dir = None
    # our customers want the flag to be named "epi" not como"
    args.epi_dir = get_como_folder_structure(os.path.join(
        args.input_data_root, 'como', str(args.epi_version)))
    if hasattr(args, 'paf_version'):
        # PAF directory structure has no "draws" sub-folder
        args.paf_dir = "FILEPATH".format(args.input_data_root,
                                         args.paf_version)
    else:
        args.paf_dir = None
    return args