Exemplo n.º 1
0
def save_mapping_to_hdf5():
    """Store the medicine mapping file in the prescribe HDF5 output.

    Both ``MAPPING_DIR`` and ``PREP_OUTPUT_DIR`` are passed through
    ``check_directory`` and the module-level globals are rebound to the
    returned values. The mapping file is then handed to ``save_to_hdf5``
    together with the prescribe output path and the string
    ``'metadata/mapping_table'`` (presumably the HDF5 key -- confirm against
    ``save_to_hdf5``).

    Raises:
        ValueError: If the medicine mapping file does not exist.
    """
    global MAPPING_DIR, PREP_OUTPUT_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    # Paths are built by plain concatenation, exactly as the directory
    # values come back from check_directory.
    mapping_file = MAPPING_DIR + MEDICINE_MAPPING_PATH
    hdf5_target = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH

    if os.path.isfile(mapping_file):
        save_to_hdf5(mapping_file, hdf5_target, 'metadata/mapping_table')
        return

    raise ValueError("There is no medicine_mapping dataframe!")
Exemplo n.º 2
0
def save_mapping_to_hdf5():
    '''
    Save the mapping table in hdf5 format.
    It is stored under metadata/mapping_table.

    MAPPING_DIR and PREP_OUTPUT_DIR are passed through check_directory and
    the module-level globals are rebound to the returned values.

    Raises ValueError when the KCD mapping file does not exist.
    '''
    global MAPPING_DIR, PREP_OUTPUT_DIR

    MAPPING_DIR = check_directory(MAPPING_DIR)
    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)

    # Source mapping file and destination HDF5 file, built by concatenation.
    kcd_mapping_file = MAPPING_DIR + KCD_MAPPING_PATH
    hdf5_target = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    if os.path.isfile(kcd_mapping_file):
        save_to_hdf5(kcd_mapping_file, hdf5_target, 'metadata/mapping_table')
        return

    raise ValueError("There is no KCD_OUTPUT file!")
Exemplo n.º 3
0
def run(demographic_path):
    """Preprocess the demographic Excel file and store it via ``save_to_hdf5``.

    The sheet is read with ``pd.read_excel``, its columns are renamed to
    ``DEMO_COL_NAME``, ``sex`` is encoded as ``F -> 1`` / ``M -> 0`` and
    ``age`` is mapped through ``check_age``. The frame is written to
    ``TEMP_PATH`` as delimited text and that file is passed to
    ``save_to_hdf5`` with the ``'data/original'`` key.

    Args:
        demographic_path: Path of the Excel file to read.

    Raises:
        ValueError: If ``TEMP_PATH`` already exists when the function starts.
    """
    global PREP_OUTPUT_DIR, DEMOGRAPHIC_OUTPUT_PATH, TEMP_PATH

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    demographic_output_path = PREP_OUTPUT_DIR + DEMOGRAPHIC_OUTPUT_PATH

    # An existing temp file is treated as a sign that another run is active.
    if os.path.isfile(TEMP_PATH):
        raise ValueError("data Corruption WARNING! --> maybe other process using TEMP file ")

    demographic_df = pd.read_excel(demographic_path)
    demographic_df.columns = DEMO_COL_NAME
    sex_dict = {'F': 1, 'M': 0}
    demographic_df['sex'] = demographic_df['sex'].map(sex_dict)
    demographic_df['age'] = demographic_df['age'].map(check_age)

    try:
        demographic_df.to_csv(TEMP_PATH, sep=DELIM, index=False)
        save_to_hdf5(TEMP_PATH, demographic_output_path, 'data/original')
    finally:
        # Always clean up: a leftover temp file would make every later run
        # fail the guard above until it is deleted by hand.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)
Exemplo n.º 4
0
def run(diagnosis_data_path):
    """Preprocess the diagnosis file chunk by chunk and store it via ``save_to_hdf5``.

    The input is read with ``pd.read_csv`` in chunks of ``CHUNK_SIZE`` rows.
    For each chunk, ``KCD_code`` is passed through ``strip_space`` and then
    through the mapping returned by ``get_diagnosis_map``, and ``date`` is
    passed through ``convert_month``. Chunks are accumulated in ``TEMP_PATH``
    (header written once, with the first chunk), which is then handed to
    ``save_to_hdf5`` with the ``'data'`` key. Finally the mapping table is
    stored with ``save_mapping_to_hdf5``.

    Args:
        diagnosis_data_path: Path of the delimited diagnosis file to read.

    Raises:
        ValueError: If ``TEMP_PATH`` already exists when the function starts.
    """
    global DELIM, KCD_COL_NAME, KCD_USE_COLS, CHUNK_SIZE, DIAGNOSIS_OUTPUT_PATH, DEBUG_PRINT, TEMP_PATH, PREP_OUTPUT_DIR

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    diagnosis_output_path = PREP_OUTPUT_DIR + DIAGNOSIS_OUTPUT_PATH

    KCD_to_code = get_diagnosis_map()  # mapping dictionary

    # An existing temp file is treated as a sign that another run is active.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(diagnosis_data_path,
                         delimiter=DELIM,
                         header=None,
                         names=KCD_COL_NAME,
                         usecols=KCD_USE_COLS,
                         chunksize=CHUNK_SIZE)
    try:
        for idx, chunk in enumerate(chunks):
            # Apply the per-column mappings.
            chunk.KCD_code = chunk.KCD_code.map(strip_space)
            chunk.KCD_code = chunk.KCD_code.map(KCD_to_code)
            chunk.date = chunk.date.map(convert_month)

            # Compare by value: `idx is 0` only works through CPython's
            # small-int caching and emits a SyntaxWarning on Python 3.8+.
            if idx == 0:
                chunk.to_csv(TEMP_PATH,
                             sep=DELIM,
                             header=KCD_USE_COLS,
                             index=False)
            else:
                chunk.to_csv(TEMP_PATH,
                             sep=DELIM,
                             header=False,
                             index=False,
                             mode='a')

            if DEBUG_PRINT:
                print('{} th chunk of output enters temp file'.format(idx))

        save_to_hdf5(TEMP_PATH, diagnosis_output_path, 'data')
    finally:
        # Always clean up: a leftover temp file would make every later run
        # fail the guard above until it is deleted by hand.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)

    save_mapping_to_hdf5()
Exemplo n.º 5
0
def run(prescribe_lab_path):
    """Preprocess the prescription file chunk by chunk and store it via ``save_to_hdf5``.

    The input is read with ``pd.read_csv`` in chunks of ``CHUNK_SIZE`` rows.
    For each chunk, rows flagged by ``check_not_date_type`` on ``date`` are
    dropped, the ``medi_name`` and ``date1`` columns are removed,
    ``medi_code`` is mapped through the dictionary returned by
    ``get_prescribe_map``, ``date`` through ``convert_month`` and ``times``
    through ``convert_times_per_month``. Chunks are accumulated in
    ``TEMP_PATH`` (header written once, with the first chunk), which is then
    handed to ``save_to_hdf5`` with the ``'data'`` key. Finally the mapping
    table is stored with ``save_mapping_to_hdf5``.

    Args:
        prescribe_lab_path: Path of the delimited prescription file to read.

    Raises:
        ValueError: If ``TEMP_PATH`` already exists when the function starts.
    """
    global DELIM, CHUNK_SIZE, PRESCRIBE_OUTPUT_PATH, PREP_OUTPUT_DIR, TEMP_PATH

    PREP_OUTPUT_DIR = check_directory(PREP_OUTPUT_DIR)
    prescribe_output_path = PREP_OUTPUT_DIR + PRESCRIBE_OUTPUT_PATH

    mapping_dict = get_prescribe_map()  # mapping dictionary

    # An existing temp file is treated as a sign that another run is active.
    if os.path.isfile(TEMP_PATH):
        raise ValueError(
            "data Corruption WARNING! --> maybe other process using TEMP file "
        )

    chunks = pd.read_csv(prescribe_lab_path,
                         delimiter=DELIM,
                         chunksize=CHUNK_SIZE)
    try:
        for idx, chunk in enumerate(chunks):
            #### temporary code: start ###
            chunk.drop(chunk[chunk.date.map(check_not_date_type)].index,
                       inplace=True)
            chunk.drop(['medi_name', 'date1'], axis=1, inplace=True)
            #### temporary code: end ###
            chunk['medi_code'] = chunk['medi_code'].map(mapping_dict)
            chunk['date'] = chunk['date'].map(convert_month)
            chunk['times'] = chunk['times'].map(convert_times_per_month)

            # Compare by value: `idx is 0` only works through CPython's
            # small-int caching and emits a SyntaxWarning on Python 3.8+.
            if idx == 0:
                chunk.to_csv(TEMP_PATH, sep=DELIM, index=False)
            else:
                chunk.to_csv(TEMP_PATH,
                             sep=DELIM,
                             index=False,
                             header=False,
                             mode='a')
            if DEBUG_PRINT:
                print('{} th chunk of output enters temp file'.format(idx))

        save_to_hdf5(TEMP_PATH, prescribe_output_path, 'data')
    finally:
        # Always clean up: a leftover temp file would make every later run
        # fail the guard above until it is deleted by hand.
        if os.path.isfile(TEMP_PATH):
            os.remove(TEMP_PATH)

    save_mapping_to_hdf5()