Example #1
 def create_compare_version(self):
     pv = GBDProcessVersion(self.gbd_process_version_id)
     pv._update_status(gbd_process_version_status.ACTIVE)
     description = f'COMO v{self.como_version_id}'
     cv = CompareVersion.add_new_version(
         gbd_round_id=self.gbd_round_id,
         decomp_step=decomp_step_from_decomp_step_id(self.decomp_step_id),
         compare_version_description=description)
     df = bleeding_edge(cv.compare_version_id)
     cv.add_process_version(df.gbd_process_version_id.tolist())
     cv._update_status(gbd_process_version_status.ACTIVE)
     return cv
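A note on Example #1: judging by the .gbd_process_version_id.tolist() access,
bleeding_edge(compare_version_id) appears to return a pandas DataFrame with one
row per current process version. A minimal stand-in sketch of that assumed
shape (the ids and the gbd_process_id column are invented; only the
gbd_process_version_id column name comes from the example):

import pandas as pd

# Hypothetical stand-in for what bleeding_edge() seems to return: one row per
# most-recent GBD process version. All values are made up.
df = pd.DataFrame({
    "gbd_process_id": [1, 3, 4],
    "gbd_process_version_id": [101, 102, 103],
})

# The extraction Example #1 performs before cv.add_process_version(...):
# a plain Python list of process version ids.
process_version_ids = df.gbd_process_version_id.tolist()
print(process_version_ids)  # [101, 102, 103]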
Example #2
 def create_gbd_process_version(self, code_version):
     gbd_metadata = {gbd_metadata_type.COMO: self.como_version_id}
     gbd_note = f'COMO v{self.como_version_id}'
     pv = GBDProcessVersion.add_new_version(
         gbd_process_id=gbd_process.EPI,
         gbd_process_version_note=gbd_note,
         code_version=code_version,
         gbd_round_id=self.gbd_round_id,
         decomp_step=decomp_step_from_decomp_step_id(self.decomp_step_id),
         metadata=gbd_metadata)
     self.gbd_process_version_id = pv.gbd_process_version_id
     return pv
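Examples #1 and #2 read like methods of the same COMO upload class: Example #2
creates the EPI process version and stores its id on self, and Example #1 then
attaches that id (plus the current bleeding-edge versions) to a new
CompareVersion. A hypothetical shell showing how they could fit together; the
class name and constructor are assumptions, only the attribute names come from
the self. references in the examples:

class ComoUploader:
    def __init__(self, como_version_id, gbd_round_id, decomp_step_id):
        self.como_version_id = como_version_id
        self.gbd_round_id = gbd_round_id
        self.decomp_step_id = decomp_step_id
        self.gbd_process_version_id = None  # set by create_gbd_process_version

    # create_gbd_process_version (Example #2) would be defined here and run
    # first; create_compare_version (Example #1) would then use the stored
    # gbd_process_version_id when building the new compare version.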
Example #3
 def create_compare_version(self, current_compare_version_id,
                            description):
     cur_cv = CompareVersion(current_compare_version_id)
     cv = CompareVersion.add_new_version(
         gbd_round_id=self.gbd_round_id,
         compare_version_description=description)
     for pv in cur_cv.gbd_process_version_ids():
         gbd_process_version = GBDProcessVersion(pv)
         if gbd_process_version.gbd_process_id != 1:
             cv.add_process_version(pv)
     cv.add_process_version(self.gbd_process_version_id)
     cv.mark_best()
     cv.unmark_best()
Example #4
def run_dalynator_upload(out_dir, gbd_process_version_id, location_ids,
                         measure_id, table_type, upload_to_test):
    """
    Upload summary files to GBD Outputs summary tables

    Args:
        out_dir (str): the root directory for this dalynator run
        gbd_process_version_id (int): GBD Process Version to upload to
        location_ids (List[int]): location ids used in this dalynator run
        measure_id (int): measure_id of the upload
        table_type (str): type of DB table to upload into (single or multi year)
        upload_to_test (bool): determines whether to upload to the test db
    """

    start_time = time.time()
    logger.info("START pipeline dalynator upload at {}".format(start_time))

    # Get process version
    if upload_to_test:
        upload_env = DBEnv.DEV
    else:
        upload_env = DBEnv.PROD
    pv = GBDProcessVersion(gbd_process_version_id, env=upload_env)

    # Read in all summary files and save
    summary_data = []
    total_rows = 0
    for location_id in location_ids:
        file_pattern = "FILEPATH.csv".format(
            o=out_dir, l=location_id, m=measure_id, tt=table_type)
        for d in import_data_list_from_file_pattern(file_pattern):
            total_rows += len(d)
            summary_data.append(d)
        if total_rows > 20000000:
            # Concatenate results and save for upload
            concatenate_and_upload(
                summary_data, table_type, pv, out_dir)
            # Reset summary data and row count
            summary_data = []
            total_rows = 0
    if total_rows > 0:
        # Concatenate results and save for upload
        concatenate_and_upload(
            summary_data, table_type, pv, out_dir)
        # Reset summary data and row count
        summary_data = []
        total_rows = 0

    # Upload data
    table_name = "output_summary_{}_v{}".format(
        table_type, pv.gbd_process_version_id)
    logger.info("Uploading data, time = {}".format(time.time()))
    pv.upload_to_table(table_name)
    logger.info("End uploading data, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE cleanup pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
Example #5
            os.chmod(file, 0o775)

    sesh = get_session(conn_def='gbd', connectable=True)
    infiler = Infiles(table=table_name, schema='gbd', session=sesh)
    file_list = sorted(os.listdir(data_dir))
    for f in file_list:
        print(f)
        infiler.infile(os.path.join(data_dir, f), with_replace=False, commit=True)

# parse args
sev_version_id = int(sys.argv[1])
paf_version_id = int(sys.argv[2])
gbd_round_id = int(sys.argv[3])
decomp_step = sys.argv[4]

# set up process version id
git_hash = subprocess.check_output(['git', '--git-dir=FILEPATH/sev/.git',
    '--work-tree=FILEPATH', 'rev-parse', 'HEAD']).strip()
process_version_id = GBDProcessVersion.add_new_version(
    gbd_process_id=14,
    gbd_process_version_note='SEV v{}'.format(sev_version_id),
    code_version=git_hash,
    gbd_round_id=gbd_round_id,
    decomp_step=decomp_step,
    metadata={2: paf_version_id})

# upload
for table_type in ['single_year', 'multi_year']:
    upload(sev_version_id, process_version_id, table_type)
process_version_id.mark_best()
Example #6
def run_burdenator_upload(out_dir,
                          gbd_process_version_id,
                          location_ids,
                          measure_id,
                          table_type,
                          upload_to_test,
                          raise_on_error=False):
    """
    Upload summary files to GBD Outputs risk tables

    Args:
        out_dir (str): the root directory for this burdenator run
        gbd_process_version_id (int): GBD Process Version to upload to
        location_ids (List[int]): location ids used in this burdenator run
        measure_id (int): measure_id of the upload
        table_type (str): type of DB table to upload into
            (single or multi year)
        upload_to_test (bool): determines whether to upload to the test db
        raise_on_error (bool, default=False): if True, raise when an exception
            is caught while infiling; if False, just log the error and
            continue to the next file
    """
    start_time = time.time()
    logger.info("START pipeline burdenator upload at {}".format(start_time))
    # Get process version
    if upload_to_test:
        upload_env = DBEnv.DEV
    else:
        upload_env = DBEnv.PROD

    pv = GBDProcessVersion(gbd_process_version_id, env=upload_env)

    # Get the prefix for file names
    if pv.gbd_process_id == RISK_GBD_PROCESS_ID:
        prefix = 'upload_risk_'
    elif pv.gbd_process_id == ETI_GBD_PROCESS_ID:
        prefix = 'upload_eti_'

    # call load data infile on summary files in primary key order
    # (year/location)
    file_df = make_file_df(out_dir, measure_id, table_type, prefix,
                           location_ids)

    # Upload data
    if pv.gbd_process_id == RISK_GBD_PROCESS_ID:
        table_name = "output_risk_{}_v{}".format(table_type,
                                                 pv.gbd_process_version_id)
    elif pv.gbd_process_id == ETI_GBD_PROCESS_ID:
        table_name = "output_etiology_{}_v{}".format(table_type,
                                                     pv.gbd_process_version_id)
    logger.info("Uploading data to table {}, time = {}".format(
        table_name, time.time()))
    upload_to_table(file_df.raw.tolist(), table_name, upload_env,
                    raise_on_error)
    logger.info("End uploading data, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE upload pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))
Example #7
def run_upload(out_dir,
               gbd_process_version_id,
               location_ids,
               table_type,
               upload_to_test,
               storage_engine='INNODB'):
    """
    Upload summary files to GBD Outputs risk tables

    Args:
        out_dir (str): the root directory for this run
        gbd_process_version_id (int): GBD Process Version to upload to
        location_ids (List[int]): location ids used in this run
        table_type (str): type of DB table to upload into
            (single or multi year)
        upload_to_test (bool): determines whether to upload to the test db
        storage_engine (str, default='INNODB'): either 'COLUMNSTORE' or
            'INNODB'. Determines which database to upload to.
        skip_raise_on (Exception, tuple): will not raise on errors specified.
            All other exceptions are raised and result in a failed job.
    """
    start_time = time.time()
    logger.info("START pipeline upload at {}".format(start_time))
    # Get process version
    if upload_to_test:
        upload_env = DBEnv.DEV
    else:
        upload_env = DBEnv.PROD

    pv = GBDProcessVersion(gbd_process_version_id, env=upload_env)

    # Get the prefix for file names
    if pv.gbd_process_id == RISK_GBD_PROCESS_ID:
        prefix = 'upload_risk'
        gbd_component = 'risk'
    elif pv.gbd_process_id == ETI_GBD_PROCESS_ID:
        prefix = 'upload_eti'
        gbd_component = 'etiology'
    elif pv.gbd_process_id == SUMM_GBD_PROCESS_ID:
        prefix = 'upload_summary'
        gbd_component = 'summary'

    output_null_inf_count(out_dir, out_dir, table_type, prefix)

    # call load data infile on summary files in primary key order
    if storage_engine == "INNODB":
        file_df = make_file_df(out_dir, table_type, prefix, location_ids)
    elif storage_engine == "COLUMNSTORE":
        file_df = make_file_df_cs(out_dir, table_type, prefix, location_ids)
    else:
        logger.info(
            "Unxepected storage engine type when creating file_df {}".format(
                storage_engine))
        file_df = make_file_df(out_dir, table_type, prefix, location_ids)

    logger.info("Uploading {} data, {}, time = {}".format(
        gbd_component, table_type, time.time()))
    uploader = NatorUpload(out_dir,
                           gbd_process_version_id,
                           gbd_component,
                           table_type,
                           location_ids,
                           upload_to_test,
                           storage_engine=storage_engine)
    uploader.make_partitions()
    # make a dictionary where keys are location_id and values
    # are lists of file paths for that location
    file_dict = {
        location_id: file_df.loc[file_df['location_id'] == location_id,
                                 'raw'].tolist()
        for location_id in uploader.location_ids
    }
    uploader.run_all_uploads_mp(file_dict)
    logger.info("End uploading data, time = {}".format(time.time()))

    # End log
    end_time = time.time()
    elapsed = end_time - start_time
    logger.info("DONE upload pipeline at {}, elapsed seconds= {}".format(
        end_time, elapsed))
    logger.info("{}".format(SUCCESS_LOG_MESSAGE))