def create_compare_version(self):
    pv = GBDProcessVersion(self.gbd_process_version_id)
    pv._update_status(gbd_process_version_status.ACTIVE)
    description = f'COMO v{self.como_version_id}'
    cv = CompareVersion.add_new_version(
        gbd_round_id=self.gbd_round_id,
        decomp_step=decomp_step_from_decomp_step_id(self.decomp_step_id),
        compare_version_description=description)
    df = bleeding_edge(cv.compare_version_id)
    cv.add_process_version(df.gbd_process_version_id.tolist())
    cv._update_status(gbd_process_version_status.ACTIVE)
    return cv
def create_gbd_process_version(self, code_version):
    gbd_metadata = {gbd_metadata_type.COMO: self.como_version_id}
    gbd_note = f'COMO v{self.como_version_id}'
    pv = GBDProcessVersion.add_new_version(
        gbd_process_id=gbd_process.EPI,
        gbd_process_version_note=gbd_note,
        code_version=code_version,
        gbd_round_id=self.gbd_round_id,
        decomp_step=decomp_step_from_decomp_step_id(self.decomp_step_id),
        metadata=gbd_metadata)
    self.gbd_process_version_id = pv.gbd_process_version_id
    return pv
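# Hedged usage sketch (an assumption, not from the source): how the two COMO
# methods above might be chained once a run finishes. `como` stands in for
# whatever object defines these methods, and `code_version` for the hash of
# the code that produced the run; both names are illustrative only.
def publish_como_run(como, code_version):
    """Create the GBD process version, then wrap it in a new compare version."""
    process_version = como.create_gbd_process_version(code_version)
    compare_version = como.create_compare_version()
    return process_version, compare_version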
def create_compare_version(self, current_compare_version_id, description):
    cur_cv = CompareVersion(current_compare_version_id)
    cv = CompareVersion.add_new_version(
        gbd_round_id=self.gbd_round_id,
        compare_version_description=description)
    # Copy process versions from the current compare version, skipping any
    # whose gbd_process_id is 1, then attach this run's new process version
    for pv in cur_cv.gbd_process_version_ids():
        gbd_process_version = GBDProcessVersion(pv)
        if gbd_process_version.gbd_process_id != 1:
            cv.add_process_version(pv)
    cv.add_process_version(self.gbd_process_version_id)
    cv.mark_best()
    cv.unmark_best()
def run_dalynator_upload(out_dir, gbd_process_version_id, location_ids,
                         measure_id, table_type, upload_to_test):
    """Upload summary files to GBD Outputs summary tables.

    Args:
        out_dir (str): the root directory for this dalynator run
        gbd_process_version_id (int): GBD Process Version to upload to
        location_ids (List[int]): location ids used in this dalynator run
        measure_id (int): measure_id of the upload
        table_type (str): type of DB table to upload into
            (single or multi year)
        upload_to_test (bool): determines whether to upload to the test db
    """
    start_time = time.time()
    logger.info("START pipeline dalynator upload at {}".format(start_time))

    # Get process version
    if upload_to_test:
        upload_env = DBEnv.DEV
    else:
        upload_env = DBEnv.PROD
    pv = GBDProcessVersion(gbd_process_version_id, env=upload_env)

    # Read in all summary files and save
    summary_data = []
    total_rows = 0
    for location_id in location_ids:
        file_pattern = "FILEPATH.csv".format(
            o=out_dir, l=location_id, m=measure_id, tt=table_type)
        for d in import_data_list_from_file_pattern(file_pattern):
            total_rows += len(d)
            summary_data.append(d)
            if total_rows > 20000000:
                # Concatenate results and save for upload
                concatenate_and_upload(
                    summary_data, table_type, pv, out_dir)
                # Reset summary data and row count
                summary_data = []
                total_rows = 0
    if total_rows > 0:
        # Concatenate results and save for upload
        concatenate_and_upload(
            summary_data, table_type, pv, out_dir)
        # Reset summary data and row count
        summary_data = []
        total_rows = 0

    # Upload data
    table_name = "output_summary_{}_v{}".format(
        table_type, pv.gbd_process_version_id)
    logger.info("Uploading data, time = {}".format(time.time()))
    pv.upload_to_table(table_name)
    logger.info("End uploading data, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
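# A minimal, generic sketch of the batching pattern used above: accumulate
# DataFrames and flush whenever the running row count crosses a threshold,
# then flush the remainder at the end. `flush_fn` stands in for
# concatenate_and_upload and is an assumption, not part of the source.
import pandas as pd


def batched_flush(frames, flush_fn, max_rows=20_000_000):
    """Accumulate DataFrames and call flush_fn on each concatenated batch."""
    batch, total_rows = [], 0
    for df in frames:
        batch.append(df)
        total_rows += len(df)
        if total_rows > max_rows:
            flush_fn(pd.concat(batch))
            batch, total_rows = [], 0
    if total_rows > 0:
        flush_fn(pd.concat(batch))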
os.chmod(file, 0o775)
sesh = get_session(conn_def='gbd', connectable=True)
infiler = Infiles(table=table_name, schema='gbd', session=sesh)
file_list = sorted(os.listdir(data_dir))
for f in file_list:
    print(f)
    infiler.infile(os.path.join(data_dir, f), with_replace=False,
                   commit=True)

# parse args
sev_version_id = int(sys.argv[1])
paf_version_id = int(sys.argv[2])
gbd_round_id = int(sys.argv[3])
decomp_step = sys.argv[4]

# set up process version id
git_hash = subprocess.check_output(
    ['git', '--git-dir=FILEPATH/sev/.git', '--work-tree=FILEPATH',
     'rev-parse', 'HEAD']).strip()
process_version_id = GBDProcessVersion.add_new_version(
    gbd_process_id=14,
    gbd_process_version_note='SEV v{}'.format(sev_version_id),
    code_version=git_hash,
    gbd_round_id=gbd_round_id,
    decomp_step=decomp_step,
    metadata={2: paf_version_id})

# upload
for table_type in ['single_year', 'multi_year']:
    upload(sev_version_id, process_version_id, table_type)
process_version_id.mark_best()
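# Hedged sketch of the git-hash lookup above as a reusable helper. It uses
# `git -C <dir>` instead of the --git-dir/--work-tree pair, and the repo_dir
# argument replaces the redacted FILEPATH values; both choices are
# assumptions for illustration, not part of the source script.
import subprocess


def current_git_hash(repo_dir):
    """Return the HEAD commit hash of the repository at repo_dir as a str."""
    return subprocess.check_output(
        ['git', '-C', repo_dir, 'rev-parse', 'HEAD']).decode().strip()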
def run_burdenator_upload(out_dir, gbd_process_version_id, location_ids,
                          measure_id, table_type, upload_to_test,
                          raise_on_error=False):
    """Upload summary files to GBD Outputs risk tables.

    Args:
        out_dir (str): the root directory for this burdenator run
        gbd_process_version_id (int): GBD Process Version to upload to
        location_ids (List[int]): location ids used in this burdenator run
        measure_id (int): measure_id of the upload
        table_type (str): type of DB table to upload into
            (single or multi year)
        upload_to_test (bool): determines whether to upload to the test db
        raise_on_error (bool, default=False): while infiling, if an exception
            is caught, raise. If False, just log and continue to the next file
    """
    start_time = time.time()
    logger.info("START pipeline burdenator upload at {}".format(start_time))

    # Get process version
    if upload_to_test:
        upload_env = DBEnv.DEV
    else:
        upload_env = DBEnv.PROD
    pv = GBDProcessVersion(gbd_process_version_id, env=upload_env)

    # Get the prefix for file names
    if pv.gbd_process_id == RISK_GBD_PROCESS_ID:
        prefix = 'upload_risk_'
    elif pv.gbd_process_id == ETI_GBD_PROCESS_ID:
        prefix = 'upload_eti_'

    # call load data infile on summary files in primary key order
    # (year/location)
    file_df = make_file_df(out_dir, measure_id, table_type, prefix,
                           location_ids)

    # Upload data
    if pv.gbd_process_id == RISK_GBD_PROCESS_ID:
        table_name = "output_risk_{}_v{}".format(
            table_type, pv.gbd_process_version_id)
    elif pv.gbd_process_id == ETI_GBD_PROCESS_ID:
        table_name = "output_etiology_{}_v{}".format(
            table_type, pv.gbd_process_version_id)
    logger.info("Uploading data to table {}, time = {}".format(
        table_name, time.time()))
    upload_to_table(file_df.raw.tolist(), table_name, upload_env,
                    raise_on_error)
    logger.info("End uploading data, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE upload pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
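# Hedged sketch of the per-file behavior that the raise_on_error flag above
# describes: attempt each infile, and either re-raise on failure or log and
# move on. `infile_fn` is a stand-in for the real loader and is an
# assumption; this is not the source's upload_to_table implementation.
import logging

sketch_logger = logging.getLogger(__name__)


def infile_all(file_paths, infile_fn, raise_on_error=False):
    """Infile each file; optionally continue past failures."""
    for path in file_paths:
        try:
            infile_fn(path)
        except Exception:
            if raise_on_error:
                raise
            sketch_logger.exception("Infile failed for %s, continuing", path)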
def run_upload(out_dir, gbd_process_version_id, location_ids, table_type,
               upload_to_test, storage_engine='INNODB'):
    """Upload summary files to GBD Outputs risk tables.

    Args:
        out_dir (str): the root directory for this run
        gbd_process_version_id (int): GBD Process Version to upload to
        location_ids (List[int]): location ids used in this run
        table_type (str): type of DB table to upload into
            (single or multi year)
        upload_to_test (bool): determines whether to upload to the test db
        storage_engine (str, default='INNODB'): either 'COLUMNSTORE' or
            'INNODB'. Determines which database to upload to.
    """
    start_time = time.time()
    logger.info("START pipeline upload at {}".format(start_time))

    # Get process version
    if upload_to_test:
        upload_env = DBEnv.DEV
    else:
        upload_env = DBEnv.PROD
    pv = GBDProcessVersion(gbd_process_version_id, env=upload_env)

    # Get the prefix for file names
    if pv.gbd_process_id == RISK_GBD_PROCESS_ID:
        prefix = 'upload_risk'
        gbd_component = 'risk'
    elif pv.gbd_process_id == ETI_GBD_PROCESS_ID:
        prefix = 'upload_eti'
        gbd_component = 'etiology'
    elif pv.gbd_process_id == SUMM_GBD_PROCESS_ID:
        prefix = 'upload_summary'
        gbd_component = 'summary'

    output_null_inf_count(out_dir, out_dir, table_type, prefix)

    # call load data infile on summary files in primary key order
    if storage_engine == "INNODB":
        file_df = make_file_df(out_dir, table_type, prefix, location_ids)
    elif storage_engine == "COLUMNSTORE":
        file_df = make_file_df_cs(out_dir, table_type, prefix, location_ids)
    else:
        logger.info(
            "Unexpected storage engine type when creating file_df {}".format(
                storage_engine))
        file_df = make_file_df(out_dir, table_type, prefix, location_ids)

    logger.info("Uploading {} data, {}, time = {}".format(
        gbd_component, table_type, time.time()))
    uploader = NatorUpload(out_dir, gbd_process_version_id, gbd_component,
                           table_type, location_ids, upload_to_test,
                           storage_engine=storage_engine)
    uploader.make_partitions()

    # make a dictionary where keys are location_id and values
    # are lists of file paths for that location
    file_dict = {
        location_id: file_df.loc[
            file_df['location_id'] == location_id, 'raw'].tolist()
        for location_id in uploader.location_ids
    }
    uploader.run_all_uploads_mp(file_dict)
    logger.info("End uploading data, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE upload pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
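# A small, self-contained sketch of the file_dict construction above: given a
# DataFrame with 'location_id' and 'raw' (file path) columns, group the paths
# by location in a single groupby pass. Unlike the comprehension in
# run_upload, locations with no files simply do not appear rather than
# mapping to empty lists. The sample filenames are illustrative only.
import pandas as pd

file_df_example = pd.DataFrame({
    'location_id': [6, 6, 102],
    'raw': ['upload_risk_6_1990.csv',
            'upload_risk_6_1995.csv',
            'upload_risk_102_1990.csv'],
})
file_dict_example = {
    loc: paths.tolist()
    for loc, paths in file_df_example.groupby('location_id')['raw']
}
# {6: ['upload_risk_6_1990.csv', 'upload_risk_6_1995.csv'],
#  102: ['upload_risk_102_1990.csv']}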